| /******************** GPUJIT.c - GPUJIT Execution Engine **********************/ |
| /* */ |
| /* The LLVM Compiler Infrastructure */ |
| /* */ |
| /* This file is dual licensed under the MIT and the University of Illinois */ |
| /* Open Source License. See LICENSE.TXT for details. */ |
| /* */ |
| /******************************************************************************/ |
| /* */ |
| /* This file implements GPUJIT, a ptx string execution engine for GPU. */ |
| /* */ |
| /******************************************************************************/ |
| |
| #include "GPUJIT.h" |
| |
| #ifdef HAS_LIBCUDART |
| #include <cuda.h> |
| #include <cuda_runtime.h> |
| #endif /* HAS_LIBCUDART */ |
| |
| #ifdef HAS_LIBOPENCL |
| #ifdef __APPLE__ |
| #include <OpenCL/opencl.h> |
| #else |
| #include <CL/cl.h> |
| #endif /* __APPLE__ */ |
| #endif /* HAS_LIBOPENCL */ |
| |
| #include <assert.h> |
| #include <dlfcn.h> |
| #include <stdarg.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| |
| static int DebugMode; |
| static int CacheMode; |
| #define max(x, y) ((x) > (y) ? (x) : (y)) |
| |
| static PollyGPURuntime Runtime = RUNTIME_NONE; |
| |
| static void debug_print(const char *format, ...) { |
| if (!DebugMode) |
| return; |
| |
| va_list args; |
| va_start(args, format); |
| vfprintf(stderr, format, args); |
| va_end(args); |
| } |
| #define dump_function() debug_print("-> %s\n", __func__) |
| |
| #define KERNEL_CACHE_SIZE 10 |
| |
| static void err_runtime() __attribute__((noreturn)); |
| static void err_runtime() { |
| fprintf(stderr, "Runtime not correctly initialized.\n"); |
| exit(-1); |
| } |
| |
| struct PollyGPUContextT { |
| void *Context; |
| }; |
| |
| struct PollyGPUFunctionT { |
| void *Kernel; |
| }; |
| |
| struct PollyGPUDevicePtrT { |
| void *DevicePtr; |
| }; |
| |
| /******************************************************************************/ |
| /* OpenCL */ |
| /******************************************************************************/ |
| #ifdef HAS_LIBOPENCL |
| |
| struct OpenCLContextT { |
| cl_context Context; |
| cl_command_queue CommandQueue; |
| }; |
| |
| struct OpenCLKernelT { |
| cl_kernel Kernel; |
| cl_program Program; |
| const char *BinaryString; |
| }; |
| |
| struct OpenCLDevicePtrT { |
| cl_mem MemObj; |
| }; |
| |
| /* Dynamic library handles for the OpenCL runtime library. */ |
| static void *HandleOpenCL; |
| static void *HandleOpenCLBeignet; |
| |
| /* Type-defines of function pointer to OpenCL Runtime API. */ |
| typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, |
| cl_platform_id *Platforms, |
| cl_uint *NumPlatforms); |
| static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; |
| |
| typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, |
| cl_device_type DeviceType, |
| cl_uint NumEntries, cl_device_id *Devices, |
| cl_uint *NumDevices); |
| static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; |
| |
| typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, |
| cl_device_info ParamName, |
| size_t ParamValueSize, void *ParamValue, |
| size_t *ParamValueSizeRet); |
| static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; |
| |
| typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, |
| size_t ParamValueSize, void *ParamValue, |
| size_t *ParamValueSizeRet); |
| static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; |
| |
| typedef cl_context clCreateContextFcnTy( |
| const cl_context_properties *Properties, cl_uint NumDevices, |
| const cl_device_id *Devices, |
| void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, |
| size_t CB, void *UserData), |
| void *UserData, cl_int *ErrcodeRet); |
| static clCreateContextFcnTy *clCreateContextFcnPtr; |
| |
| typedef cl_command_queue |
| clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, |
| cl_command_queue_properties Properties, |
| cl_int *ErrcodeRet); |
| static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; |
| |
| typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, |
| size_t Size, void *HostPtr, |
| cl_int *ErrcodeRet); |
| static clCreateBufferFcnTy *clCreateBufferFcnPtr; |
| |
| typedef cl_int |
| clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, |
| cl_bool BlockingWrite, size_t Offset, size_t Size, |
| const void *Ptr, cl_uint NumEventsInWaitList, |
| const cl_event *EventWaitList, cl_event *Event); |
| static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; |
| |
| typedef cl_program |
| clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices, |
| const cl_device_id *DeviceList, |
| const char *Filename, cl_int *ErrcodeRet); |
| static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr; |
| |
| typedef cl_program clCreateProgramWithBinaryFcnTy( |
| cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, |
| const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, |
| cl_int *ErrcodeRet); |
| static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; |
| |
| typedef cl_int clBuildProgramFcnTy( |
| cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, |
| const char *Options, |
| void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), |
| void *UserData); |
| static clBuildProgramFcnTy *clBuildProgramFcnPtr; |
| |
| typedef cl_kernel clCreateKernelFcnTy(cl_program Program, |
| const char *KernelName, |
| cl_int *ErrcodeRet); |
| static clCreateKernelFcnTy *clCreateKernelFcnPtr; |
| |
| typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, |
| size_t ArgSize, const void *ArgValue); |
| static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; |
| |
| typedef cl_int clEnqueueNDRangeKernelFcnTy( |
| cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, |
| const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, |
| const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, |
| const cl_event *EventWaitList, cl_event *Event); |
| static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; |
| |
| typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, |
| cl_mem Buffer, cl_bool BlockingRead, |
| size_t Offset, size_t Size, void *Ptr, |
| cl_uint NumEventsInWaitList, |
| const cl_event *EventWaitList, |
| cl_event *Event); |
| static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; |
| |
| typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue); |
| static clFlushFcnTy *clFlushFcnPtr; |
| |
| typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue); |
| static clFinishFcnTy *clFinishFcnPtr; |
| |
| typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel); |
| static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; |
| |
| typedef cl_int clReleaseProgramFcnTy(cl_program Program); |
| static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; |
| |
| typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject); |
| static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; |
| |
| typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue); |
| static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; |
| |
| typedef cl_int clReleaseContextFcnTy(cl_context Context); |
| static clReleaseContextFcnTy *clReleaseContextFcnPtr; |
| |
| static void *getAPIHandleCL(void *Handle, const char *FuncName) { |
| char *Err; |
| void *FuncPtr; |
| dlerror(); |
| FuncPtr = dlsym(Handle, FuncName); |
| if ((Err = dlerror()) != 0) { |
| fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err); |
| return 0; |
| } |
| return FuncPtr; |
| } |
| |
| static int initialDeviceAPILibrariesCL() { |
| HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY); |
| HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); |
| if (!HandleOpenCL) { |
| fprintf(stderr, "Cannot open library: %s. \n", dlerror()); |
| return 0; |
| } |
| return 1; |
| } |
| |
| /* Get function pointer to OpenCL Runtime API. |
| * |
| * Note that compilers conforming to the ISO C standard are required to |
| * generate a warning if a conversion from a void * pointer to a function |
| * pointer is attempted as in the following statements. The warning |
| * of this kind of cast may not be emitted by clang and new versions of gcc |
| * as it is valid on POSIX 2008. For compilers required to generate a warning, |
| * we temporarily disable -Wpedantic, to avoid bloating the output with |
| * unnecessary warnings. |
| * |
| * Reference: |
| * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html |
| */ |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Wpedantic" |
| static int initialDeviceAPIsCL() { |
| if (initialDeviceAPILibrariesCL() == 0) |
| return 0; |
| |
| // FIXME: We are now always selecting the Intel Beignet driver if it is |
| // available on the system, instead of a possible NVIDIA or AMD OpenCL |
| // API. This selection should occurr based on the target architecture |
| // chosen when compiling. |
| void *Handle = |
| (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL); |
| |
| clGetPlatformIDsFcnPtr = |
| (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs"); |
| |
| clGetDeviceIDsFcnPtr = |
| (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs"); |
| |
| clGetDeviceInfoFcnPtr = |
| (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo"); |
| |
| clGetKernelInfoFcnPtr = |
| (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo"); |
| |
| clCreateContextFcnPtr = |
| (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext"); |
| |
| clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( |
| Handle, "clCreateCommandQueue"); |
| |
| clCreateBufferFcnPtr = |
| (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer"); |
| |
| clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( |
| Handle, "clEnqueueWriteBuffer"); |
| |
| if (HandleOpenCLBeignet) |
| clCreateProgramWithLLVMIntelFcnPtr = |
| (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL( |
| Handle, "clCreateProgramWithLLVMIntel"); |
| |
| clCreateProgramWithBinaryFcnPtr = |
| (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( |
| Handle, "clCreateProgramWithBinary"); |
| |
| clBuildProgramFcnPtr = |
| (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram"); |
| |
| clCreateKernelFcnPtr = |
| (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel"); |
| |
| clSetKernelArgFcnPtr = |
| (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg"); |
| |
| clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( |
| Handle, "clEnqueueNDRangeKernel"); |
| |
| clEnqueueReadBufferFcnPtr = |
| (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer"); |
| |
| clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush"); |
| |
| clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish"); |
| |
| clReleaseKernelFcnPtr = |
| (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel"); |
| |
| clReleaseProgramFcnPtr = |
| (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram"); |
| |
| clReleaseMemObjectFcnPtr = |
| (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject"); |
| |
| clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( |
| Handle, "clReleaseCommandQueue"); |
| |
| clReleaseContextFcnPtr = |
| (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext"); |
| |
| return 1; |
| } |
| #pragma GCC diagnostic pop |
| |
| /* Context and Device. */ |
| static PollyGPUContext *GlobalContext = NULL; |
| static cl_device_id GlobalDeviceID = NULL; |
| |
| /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ |
| static void printOpenCLError(int Error); |
| |
| static void checkOpenCLError(int Ret, const char *format, ...) { |
| if (Ret == CL_SUCCESS) |
| return; |
| |
| printOpenCLError(Ret); |
| va_list args; |
| va_start(args, format); |
| vfprintf(stderr, format, args); |
| va_end(args); |
| exit(-1); |
| } |
| |
| static PollyGPUContext *initContextCL() { |
| dump_function(); |
| |
| PollyGPUContext *Context; |
| |
| cl_platform_id PlatformID = NULL; |
| cl_device_id DeviceID = NULL; |
| cl_uint NumDevicesRet; |
| cl_int Ret; |
| |
| char DeviceRevision[256]; |
| char DeviceName[256]; |
| size_t DeviceRevisionRetSize, DeviceNameRetSize; |
| |
| static __thread PollyGPUContext *CurrentContext = NULL; |
| |
| if (CurrentContext) |
| return CurrentContext; |
| |
| /* Get API handles. */ |
| if (initialDeviceAPIsCL() == 0) { |
| fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); |
| exit(-1); |
| } |
| |
| /* Get number of devices that support OpenCL. */ |
| static const int NumberOfPlatforms = 1; |
| Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL); |
| checkOpenCLError(Ret, "Failed to get platform IDs.\n"); |
| // TODO: Extend to CL_DEVICE_TYPE_ALL? |
| static const int NumberOfDevices = 1; |
| Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices, |
| &DeviceID, &NumDevicesRet); |
| checkOpenCLError(Ret, "Failed to get device IDs.\n"); |
| |
| GlobalDeviceID = DeviceID; |
| if (NumDevicesRet == 0) { |
| fprintf(stderr, "There is no device supporting OpenCL.\n"); |
| exit(-1); |
| } |
| |
| /* Get device revision. */ |
| Ret = |
| clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision), |
| DeviceRevision, &DeviceRevisionRetSize); |
| checkOpenCLError(Ret, "Failed to fetch device revision.\n"); |
| |
| /* Get device name. */ |
| Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName), |
| DeviceName, &DeviceNameRetSize); |
| checkOpenCLError(Ret, "Failed to fetch device name.\n"); |
| |
| debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); |
| |
| /* Create context on the device. */ |
| Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); |
| if (Context == 0) { |
| fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); |
| exit(-1); |
| } |
| Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); |
| if (Context->Context == 0) { |
| fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n"); |
| exit(-1); |
| } |
| ((OpenCLContext *)Context->Context)->Context = |
| clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret); |
| checkOpenCLError(Ret, "Failed to create context.\n"); |
| |
| static const int ExtraProperties = 0; |
| ((OpenCLContext *)Context->Context)->CommandQueue = |
| clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context, |
| DeviceID, ExtraProperties, &Ret); |
| checkOpenCLError(Ret, "Failed to create command queue.\n"); |
| |
| if (CacheMode) |
| CurrentContext = Context; |
| |
| GlobalContext = Context; |
| return Context; |
| } |
| |
| static void freeKernelCL(PollyGPUFunction *Kernel) { |
| dump_function(); |
| |
| if (CacheMode) |
| return; |
| |
| if (!GlobalContext) { |
| fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); |
| exit(-1); |
| } |
| |
| cl_int Ret; |
| Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); |
| checkOpenCLError(Ret, "Failed to flush command queue.\n"); |
| Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); |
| checkOpenCLError(Ret, "Failed to finish command queue.\n"); |
| |
| if (((OpenCLKernel *)Kernel->Kernel)->Kernel) { |
| cl_int Ret = |
| clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel); |
| checkOpenCLError(Ret, "Failed to release kernel.\n"); |
| } |
| |
| if (((OpenCLKernel *)Kernel->Kernel)->Program) { |
| cl_int Ret = |
| clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program); |
| checkOpenCLError(Ret, "Failed to release program.\n"); |
| } |
| |
| if (Kernel->Kernel) |
| free((OpenCLKernel *)Kernel->Kernel); |
| |
| if (Kernel) |
| free(Kernel); |
| } |
| |
| static PollyGPUFunction *getKernelCL(const char *BinaryBuffer, |
| const char *KernelName) { |
| dump_function(); |
| |
| if (!GlobalContext) { |
| fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); |
| exit(-1); |
| } |
| |
| static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; |
| static __thread int NextCacheItem = 0; |
| |
| for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { |
| // We exploit here the property that all Polly-ACC kernels are allocated |
| // as global constants, hence a pointer comparision is sufficient to |
| // determin equality. |
| if (KernelCache[i] && |
| ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString == |
| BinaryBuffer) { |
| debug_print(" -> using cached kernel\n"); |
| return KernelCache[i]; |
| } |
| } |
| |
| PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); |
| if (Function == 0) { |
| fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); |
| exit(-1); |
| } |
| Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); |
| if (Function->Kernel == 0) { |
| fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n"); |
| exit(-1); |
| } |
| |
| if (!GlobalDeviceID) { |
| fprintf(stderr, "GPGPU-code generation not initialized correctly.\n"); |
| exit(-1); |
| } |
| |
| cl_int Ret; |
| |
| if (HandleOpenCLBeignet) { |
| // This is a workaround, since clCreateProgramWithLLVMIntel only |
| // accepts a filename to a valid llvm-ir file as an argument, instead |
| // of accepting the BinaryBuffer directly. |
| char FileName[] = "/tmp/polly_kernelXXXXXX"; |
| int File = mkstemp(FileName); |
| write(File, BinaryBuffer, strlen(BinaryBuffer)); |
| |
| ((OpenCLKernel *)Function->Kernel)->Program = |
| clCreateProgramWithLLVMIntelFcnPtr( |
| ((OpenCLContext *)GlobalContext->Context)->Context, 1, |
| &GlobalDeviceID, FileName, &Ret); |
| checkOpenCLError(Ret, "Failed to create program from llvm.\n"); |
| close(File); |
| unlink(FileName); |
| } else { |
| size_t BinarySize = strlen(BinaryBuffer); |
| ((OpenCLKernel *)Function->Kernel)->Program = |
| clCreateProgramWithBinaryFcnPtr( |
| ((OpenCLContext *)GlobalContext->Context)->Context, 1, |
| &GlobalDeviceID, (const size_t *)&BinarySize, |
| (const unsigned char **)&BinaryBuffer, NULL, &Ret); |
| checkOpenCLError(Ret, "Failed to create program from binary.\n"); |
| } |
| |
| Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1, |
| &GlobalDeviceID, NULL, NULL, NULL); |
| checkOpenCLError(Ret, "Failed to build program.\n"); |
| |
| ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr( |
| ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret); |
| checkOpenCLError(Ret, "Failed to create kernel.\n"); |
| |
| ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer; |
| |
| if (CacheMode) { |
| if (KernelCache[NextCacheItem]) |
| freeKernelCL(KernelCache[NextCacheItem]); |
| |
| KernelCache[NextCacheItem] = Function; |
| |
| NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; |
| } |
| |
| return Function; |
| } |
| |
| static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData, |
| long MemSize) { |
| dump_function(); |
| |
| if (!GlobalContext) { |
| fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); |
| exit(-1); |
| } |
| |
| cl_int Ret; |
| Ret = clEnqueueWriteBufferFcnPtr( |
| ((OpenCLContext *)GlobalContext->Context)->CommandQueue, |
| ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, |
| HostData, 0, NULL, NULL); |
| checkOpenCLError(Ret, "Copying data from host memory to device failed.\n"); |
| } |
| |
| static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData, |
| long MemSize) { |
| dump_function(); |
| |
| if (!GlobalContext) { |
| fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); |
| exit(-1); |
| } |
| |
| cl_int Ret; |
| Ret = clEnqueueReadBufferFcnPtr( |
| ((OpenCLContext *)GlobalContext->Context)->CommandQueue, |
| ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, |
| HostData, 0, NULL, NULL); |
| checkOpenCLError(Ret, "Copying results from device to host memory failed.\n"); |
| } |
| |
| static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, |
| unsigned int GridDimY, unsigned int BlockDimX, |
| unsigned int BlockDimY, unsigned int BlockDimZ, |
| void **Parameters) { |
| dump_function(); |
| |
| cl_int Ret; |
| cl_uint NumArgs; |
| |
| if (!GlobalContext) { |
| fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); |
| exit(-1); |
| } |
| |
| OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; |
| Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS, |
| sizeof(cl_uint), &NumArgs, NULL); |
| checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); |
| |
| /* Argument sizes are stored at the end of the Parameters array. */ |
| for (cl_uint i = 0; i < NumArgs; i++) { |
| Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, |
| *((int *)Parameters[NumArgs + i]), |
| (void *)Parameters[i]); |
| checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); |
| } |
| |
| unsigned int GridDimZ = 1; |
| size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, |
| BlockDimZ * GridDimZ}; |
| size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ}; |
| |
| static const int WorkDim = 3; |
| OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; |
| Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, |
| WorkDim, NULL, GlobalWorkSize, |
| LocalWorkSize, 0, NULL, NULL); |
| checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n"); |
| } |
| |
| static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { |
| dump_function(); |
| |
| OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; |
| cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj); |
| checkOpenCLError(Ret, "Failed to free device memory.\n"); |
| |
| free(DevPtr); |
| free(Allocation); |
| } |
| |
| static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { |
| dump_function(); |
| |
| if (!GlobalContext) { |
| fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); |
| exit(-1); |
| } |
| |
| PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); |
| if (DevData == 0) { |
| fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); |
| exit(-1); |
| } |
| DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); |
| if (DevData->DevicePtr == 0) { |
| fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); |
| exit(-1); |
| } |
| |
| cl_int Ret; |
| ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj = |
| clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context, |
| CL_MEM_READ_WRITE, MemSize, NULL, &Ret); |
| checkOpenCLError(Ret, |
| "Allocate memory for GPU device memory pointer failed.\n"); |
| |
| return DevData; |
| } |
| |
| static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { |
| dump_function(); |
| |
| OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; |
| return (void *)DevPtr->MemObj; |
| } |
| |
| static void synchronizeDeviceCL() { |
| dump_function(); |
| |
| if (!GlobalContext) { |
| fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); |
| exit(-1); |
| } |
| |
| if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) != |
| CL_SUCCESS) { |
| fprintf(stderr, "Synchronizing device and host memory failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| static void freeContextCL(PollyGPUContext *Context) { |
| dump_function(); |
| |
| cl_int Ret; |
| |
| GlobalContext = NULL; |
| |
| OpenCLContext *Ctx = (OpenCLContext *)Context->Context; |
| if (Ctx->CommandQueue) { |
| Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue); |
| checkOpenCLError(Ret, "Could not release command queue.\n"); |
| } |
| |
| if (Ctx->Context) { |
| Ret = clReleaseContextFcnPtr(Ctx->Context); |
| checkOpenCLError(Ret, "Could not release context.\n"); |
| } |
| |
| free(Ctx); |
| free(Context); |
| } |
| |
| static void printOpenCLError(int Error) { |
| |
| switch (Error) { |
| case CL_SUCCESS: |
| // Success, don't print an error. |
| break; |
| |
| // JIT/Runtime errors. |
| case CL_DEVICE_NOT_FOUND: |
| fprintf(stderr, "Device not found.\n"); |
| break; |
| case CL_DEVICE_NOT_AVAILABLE: |
| fprintf(stderr, "Device not available.\n"); |
| break; |
| case CL_COMPILER_NOT_AVAILABLE: |
| fprintf(stderr, "Compiler not available.\n"); |
| break; |
| case CL_MEM_OBJECT_ALLOCATION_FAILURE: |
| fprintf(stderr, "Mem object allocation failure.\n"); |
| break; |
| case CL_OUT_OF_RESOURCES: |
| fprintf(stderr, "Out of resources.\n"); |
| break; |
| case CL_OUT_OF_HOST_MEMORY: |
| fprintf(stderr, "Out of host memory.\n"); |
| break; |
| case CL_PROFILING_INFO_NOT_AVAILABLE: |
| fprintf(stderr, "Profiling info not available.\n"); |
| break; |
| case CL_MEM_COPY_OVERLAP: |
| fprintf(stderr, "Mem copy overlap.\n"); |
| break; |
| case CL_IMAGE_FORMAT_MISMATCH: |
| fprintf(stderr, "Image format mismatch.\n"); |
| break; |
| case CL_IMAGE_FORMAT_NOT_SUPPORTED: |
| fprintf(stderr, "Image format not supported.\n"); |
| break; |
| case CL_BUILD_PROGRAM_FAILURE: |
| fprintf(stderr, "Build program failure.\n"); |
| break; |
| case CL_MAP_FAILURE: |
| fprintf(stderr, "Map failure.\n"); |
| break; |
| case CL_MISALIGNED_SUB_BUFFER_OFFSET: |
| fprintf(stderr, "Misaligned sub buffer offset.\n"); |
| break; |
| case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: |
| fprintf(stderr, "Exec status error for events in wait list.\n"); |
| break; |
| case CL_COMPILE_PROGRAM_FAILURE: |
| fprintf(stderr, "Compile program failure.\n"); |
| break; |
| case CL_LINKER_NOT_AVAILABLE: |
| fprintf(stderr, "Linker not available.\n"); |
| break; |
| case CL_LINK_PROGRAM_FAILURE: |
| fprintf(stderr, "Link program failure.\n"); |
| break; |
| case CL_DEVICE_PARTITION_FAILED: |
| fprintf(stderr, "Device partition failed.\n"); |
| break; |
| case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: |
| fprintf(stderr, "Kernel arg info not available.\n"); |
| break; |
| |
| // Compiler errors. |
| case CL_INVALID_VALUE: |
| fprintf(stderr, "Invalid value.\n"); |
| break; |
| case CL_INVALID_DEVICE_TYPE: |
| fprintf(stderr, "Invalid device type.\n"); |
| break; |
| case CL_INVALID_PLATFORM: |
| fprintf(stderr, "Invalid platform.\n"); |
| break; |
| case CL_INVALID_DEVICE: |
| fprintf(stderr, "Invalid device.\n"); |
| break; |
| case CL_INVALID_CONTEXT: |
| fprintf(stderr, "Invalid context.\n"); |
| break; |
| case CL_INVALID_QUEUE_PROPERTIES: |
| fprintf(stderr, "Invalid queue properties.\n"); |
| break; |
| case CL_INVALID_COMMAND_QUEUE: |
| fprintf(stderr, "Invalid command queue.\n"); |
| break; |
| case CL_INVALID_HOST_PTR: |
| fprintf(stderr, "Invalid host pointer.\n"); |
| break; |
| case CL_INVALID_MEM_OBJECT: |
| fprintf(stderr, "Invalid memory object.\n"); |
| break; |
| case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: |
| fprintf(stderr, "Invalid image format descriptor.\n"); |
| break; |
| case CL_INVALID_IMAGE_SIZE: |
| fprintf(stderr, "Invalid image size.\n"); |
| break; |
| case CL_INVALID_SAMPLER: |
| fprintf(stderr, "Invalid sampler.\n"); |
| break; |
| case CL_INVALID_BINARY: |
| fprintf(stderr, "Invalid binary.\n"); |
| break; |
| case CL_INVALID_BUILD_OPTIONS: |
| fprintf(stderr, "Invalid build options.\n"); |
| break; |
| case CL_INVALID_PROGRAM: |
| fprintf(stderr, "Invalid program.\n"); |
| break; |
| case CL_INVALID_PROGRAM_EXECUTABLE: |
| fprintf(stderr, "Invalid program executable.\n"); |
| break; |
| case CL_INVALID_KERNEL_NAME: |
| fprintf(stderr, "Invalid kernel name.\n"); |
| break; |
| case CL_INVALID_KERNEL_DEFINITION: |
| fprintf(stderr, "Invalid kernel definition.\n"); |
| break; |
| case CL_INVALID_KERNEL: |
| fprintf(stderr, "Invalid kernel.\n"); |
| break; |
| case CL_INVALID_ARG_INDEX: |
| fprintf(stderr, "Invalid arg index.\n"); |
| break; |
| case CL_INVALID_ARG_VALUE: |
| fprintf(stderr, "Invalid arg value.\n"); |
| break; |
| case CL_INVALID_ARG_SIZE: |
| fprintf(stderr, "Invalid arg size.\n"); |
| break; |
| case CL_INVALID_KERNEL_ARGS: |
| fprintf(stderr, "Invalid kernel args.\n"); |
| break; |
| case CL_INVALID_WORK_DIMENSION: |
| fprintf(stderr, "Invalid work dimension.\n"); |
| break; |
| case CL_INVALID_WORK_GROUP_SIZE: |
| fprintf(stderr, "Invalid work group size.\n"); |
| break; |
| case CL_INVALID_WORK_ITEM_SIZE: |
| fprintf(stderr, "Invalid work item size.\n"); |
| break; |
| case CL_INVALID_GLOBAL_OFFSET: |
| fprintf(stderr, "Invalid global offset.\n"); |
| break; |
| case CL_INVALID_EVENT_WAIT_LIST: |
| fprintf(stderr, "Invalid event wait list.\n"); |
| break; |
| case CL_INVALID_EVENT: |
| fprintf(stderr, "Invalid event.\n"); |
| break; |
| case CL_INVALID_OPERATION: |
| fprintf(stderr, "Invalid operation.\n"); |
| break; |
| case CL_INVALID_GL_OBJECT: |
| fprintf(stderr, "Invalid GL object.\n"); |
| break; |
| case CL_INVALID_BUFFER_SIZE: |
| fprintf(stderr, "Invalid buffer size.\n"); |
| break; |
| case CL_INVALID_MIP_LEVEL: |
| fprintf(stderr, "Invalid mip level.\n"); |
| break; |
| case CL_INVALID_GLOBAL_WORK_SIZE: |
| fprintf(stderr, "Invalid global work size.\n"); |
| break; |
| case CL_INVALID_PROPERTY: |
| fprintf(stderr, "Invalid property.\n"); |
| break; |
| case CL_INVALID_IMAGE_DESCRIPTOR: |
| fprintf(stderr, "Invalid image descriptor.\n"); |
| break; |
| case CL_INVALID_COMPILER_OPTIONS: |
| fprintf(stderr, "Invalid compiler options.\n"); |
| break; |
| case CL_INVALID_LINKER_OPTIONS: |
| fprintf(stderr, "Invalid linker options.\n"); |
| break; |
| case CL_INVALID_DEVICE_PARTITION_COUNT: |
| fprintf(stderr, "Invalid device partition count.\n"); |
| break; |
| case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE |
| fprintf(stderr, "Invalid pipe size.\n"); |
| break; |
| case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE |
| fprintf(stderr, "Invalid device queue.\n"); |
| break; |
| |
| // NVIDIA specific error. |
| case -9999: |
| fprintf(stderr, "NVIDIA invalid read or write buffer.\n"); |
| break; |
| |
| default: |
| fprintf(stderr, "Unknown error code!\n"); |
| break; |
| } |
| } |
| |
| #endif /* HAS_LIBOPENCL */ |
| /******************************************************************************/ |
| /* CUDA */ |
| /******************************************************************************/ |
| #ifdef HAS_LIBCUDART |
| |
| struct CUDAContextT { |
| CUcontext Cuda; |
| }; |
| |
| struct CUDAKernelT { |
| CUfunction Cuda; |
| CUmodule CudaModule; |
| const char *BinaryString; |
| }; |
| |
| struct CUDADevicePtrT { |
| CUdeviceptr Cuda; |
| }; |
| |
| /* Dynamic library handles for the CUDA and CUDA runtime library. */ |
| static void *HandleCuda; |
| static void *HandleCudaRT; |
| |
| /* Type-defines of function pointer to CUDA driver APIs. */ |
| typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); |
| static CuMemAllocFcnTy *CuMemAllocFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t, |
| unsigned int); |
| static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLaunchKernelFcnTy( |
| CUfunction F, unsigned int GridDimX, unsigned int GridDimY, |
| unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, |
| unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream, |
| void **KernelParams, void **Extra); |
| static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); |
| static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t); |
| static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr); |
| static CuMemFreeFcnTy *CuMemFreeFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule); |
| static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr; |
| |
| typedef CUresult CUDAAPI CuProfilerStopFcnTy(); |
| static CuProfilerStopFcnTy *CuProfilerStopFcnPtr; |
| |
| typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext); |
| static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr; |
| |
| typedef CUresult CUDAAPI CuInitFcnTy(unsigned int); |
| static CuInitFcnTy *CuInitFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *); |
| static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr; |
| |
| typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice); |
| static CuCtxCreateFcnTy *CuCtxCreateFcnPtr; |
| |
| typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *); |
| static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int); |
| static CuDeviceGetFcnTy *CuDeviceGetFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *, |
| unsigned int, CUjit_option *, |
| void **); |
| static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module, |
| const void *Image); |
| static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, |
| const char *); |
| static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice); |
| static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); |
| static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State, |
| CUjitInputType Type, void *Data, |
| size_t Size, const char *Name, |
| unsigned int NumOptions, |
| CUjit_option *Options, |
| void **OptionValues); |
| static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions, |
| CUjit_option *Options, |
| void **OptionValues, |
| CUlinkState *StateOut); |
| static CuLinkCreateFcnTy *CuLinkCreateFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut, |
| size_t *SizeOut); |
| static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State); |
| static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr; |
| |
| typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy(); |
| static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr; |
| |
| /* Type-defines of function pointer ot CUDA runtime APIs. */ |
| typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); |
| static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; |
| |
| static void *getAPIHandleCUDA(void *Handle, const char *FuncName) { |
| char *Err; |
| void *FuncPtr; |
| dlerror(); |
| FuncPtr = dlsym(Handle, FuncName); |
| if ((Err = dlerror()) != 0) { |
| fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err); |
| return 0; |
| } |
| return FuncPtr; |
| } |
| |
| static int initialDeviceAPILibrariesCUDA() { |
| HandleCuda = dlopen("libcuda.so", RTLD_LAZY); |
| if (!HandleCuda) { |
| fprintf(stderr, "Cannot open library: %s. \n", dlerror()); |
| return 0; |
| } |
| |
| HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY); |
| if (!HandleCudaRT) { |
| fprintf(stderr, "Cannot open library: %s. \n", dlerror()); |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
| /* Get function pointer to CUDA Driver APIs. |
| * |
| * Note that compilers conforming to the ISO C standard are required to |
| * generate a warning if a conversion from a void * pointer to a function |
| * pointer is attempted as in the following statements. The warning |
| * of this kind of cast may not be emitted by clang and new versions of gcc |
| * as it is valid on POSIX 2008. For compilers required to generate a warning, |
| * we temporarily disable -Wpedantic, to avoid bloating the output with |
| * unnecessary warnings. |
| * |
| * Reference: |
| * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html |
| */ |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Wpedantic" |
| static int initialDeviceAPIsCUDA() { |
| if (initialDeviceAPILibrariesCUDA() == 0) |
| return 0; |
| |
| CuLaunchKernelFcnPtr = |
| (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel"); |
| |
| CuMemAllocFcnPtr = |
| (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); |
| |
| CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA( |
| HandleCuda, "cuMemAllocManaged"); |
| |
| CuMemFreeFcnPtr = |
| (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); |
| |
| CuMemcpyDtoHFcnPtr = |
| (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2"); |
| |
| CuMemcpyHtoDFcnPtr = |
| (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2"); |
| |
| CuModuleUnloadFcnPtr = |
| (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload"); |
| |
| CuProfilerStopFcnPtr = |
| (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop"); |
| |
| CuCtxDestroyFcnPtr = |
| (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy"); |
| |
| CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit"); |
| |
| CuDeviceGetCountFcnPtr = |
| (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount"); |
| |
| CuDeviceGetFcnPtr = |
| (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet"); |
| |
| CuCtxCreateFcnPtr = |
| (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2"); |
| |
| CuCtxGetCurrentFcnPtr = |
| (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent"); |
| |
| CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA( |
| HandleCuda, "cuModuleLoadDataEx"); |
| |
| CuModuleLoadDataFcnPtr = |
| (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData"); |
| |
| CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA( |
| HandleCuda, "cuModuleGetFunction"); |
| |
| CuDeviceComputeCapabilityFcnPtr = |
| (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA( |
| HandleCuda, "cuDeviceComputeCapability"); |
| |
| CuDeviceGetNameFcnPtr = |
| (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName"); |
| |
| CuLinkAddDataFcnPtr = |
| (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData"); |
| |
| CuLinkCreateFcnPtr = |
| (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate"); |
| |
| CuLinkCompleteFcnPtr = |
| (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete"); |
| |
| CuLinkDestroyFcnPtr = |
| (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy"); |
| |
| CuCtxSynchronizeFcnPtr = |
| (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize"); |
| |
| /* Get function pointer to CUDA Runtime APIs. */ |
| CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA( |
| HandleCudaRT, "cudaThreadSynchronize"); |
| |
| return 1; |
| } |
| #pragma GCC diagnostic pop |
| |
| static PollyGPUContext *initContextCUDA() { |
| dump_function(); |
| PollyGPUContext *Context; |
| CUdevice Device; |
| |
| int Major = 0, Minor = 0, DeviceID = 0; |
| char DeviceName[256]; |
| int DeviceCount = 0; |
| |
| static __thread PollyGPUContext *CurrentContext = NULL; |
| |
| if (CurrentContext) |
| return CurrentContext; |
| |
| /* Get API handles. */ |
| if (initialDeviceAPIsCUDA() == 0) { |
| fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n"); |
| exit(-1); |
| } |
| |
| if (CuInitFcnPtr(0) != CUDA_SUCCESS) { |
| fprintf(stderr, "Initializing the CUDA driver API failed.\n"); |
| exit(-1); |
| } |
| |
| /* Get number of devices that supports CUDA. */ |
| CuDeviceGetCountFcnPtr(&DeviceCount); |
| if (DeviceCount == 0) { |
| fprintf(stderr, "There is no device supporting CUDA.\n"); |
| exit(-1); |
| } |
| |
| CuDeviceGetFcnPtr(&Device, 0); |
| |
| /* Get compute capabilities and the device name. */ |
| CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device); |
| CuDeviceGetNameFcnPtr(DeviceName, 256, Device); |
| debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); |
| |
| /* Create context on the device. */ |
| Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); |
| if (Context == 0) { |
| fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); |
| exit(-1); |
| } |
| Context->Context = malloc(sizeof(CUDAContext)); |
| if (Context->Context == 0) { |
| fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n"); |
| exit(-1); |
| } |
| |
| // In cases where managed memory is used, it is quite likely that |
| // `cudaMallocManaged` / `polly_mallocManaged` was called before |
| // `polly_initContext` was called. |
| // |
| // If `polly_initContext` calls `CuCtxCreate` when there already was a |
| // pre-existing context created by the runtime API, this causes code running |
| // on P100 to hang. So, we query for a pre-existing context to try and use. |
| // If there is no pre-existing context, we create a new context |
| |
| // The possible pre-existing context from previous runtime API calls. |
| CUcontext MaybeRuntimeAPIContext; |
| if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) { |
| fprintf(stderr, "cuCtxGetCurrent failed.\n"); |
| exit(-1); |
| } |
| |
| // There was no previous context, initialise it. |
| if (MaybeRuntimeAPIContext == NULL) { |
| if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, |
| Device) != CUDA_SUCCESS) { |
| fprintf(stderr, "cuCtxCreateFcnPtr failed.\n"); |
| exit(-1); |
| } |
| } else { |
| ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext; |
| } |
| |
| if (CacheMode) |
| CurrentContext = Context; |
| |
| return Context; |
| } |
| |
| static void freeKernelCUDA(PollyGPUFunction *Kernel) { |
| dump_function(); |
| |
| if (CacheMode) |
| return; |
| |
| if (((CUDAKernel *)Kernel->Kernel)->CudaModule) |
| CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule); |
| |
| if (Kernel->Kernel) |
| free((CUDAKernel *)Kernel->Kernel); |
| |
| if (Kernel) |
| free(Kernel); |
| } |
| |
| static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer, |
| const char *KernelName) { |
| dump_function(); |
| |
| static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; |
| static __thread int NextCacheItem = 0; |
| |
| for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { |
| // We exploit here the property that all Polly-ACC kernels are allocated |
| // as global constants, hence a pointer comparision is sufficient to |
| // determin equality. |
| if (KernelCache[i] && |
| ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) { |
| debug_print(" -> using cached kernel\n"); |
| return KernelCache[i]; |
| } |
| } |
| |
| PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); |
| if (Function == 0) { |
| fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); |
| exit(-1); |
| } |
| Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel)); |
| if (Function->Kernel == 0) { |
| fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n"); |
| exit(-1); |
| } |
| |
| CUresult Res; |
| CUlinkState LState; |
| CUjit_option Options[6]; |
| void *OptionVals[6]; |
| float Walltime = 0; |
| unsigned long LogSize = 8192; |
| char ErrorLog[8192], InfoLog[8192]; |
| void *CuOut; |
| size_t OutSize; |
| |
| // Setup linker options |
| // Return walltime from JIT compilation |
| Options[0] = CU_JIT_WALL_TIME; |
| OptionVals[0] = (void *)&Walltime; |
| // Pass a buffer for info messages |
| Options[1] = CU_JIT_INFO_LOG_BUFFER; |
| OptionVals[1] = (void *)InfoLog; |
| // Pass the size of the info buffer |
| Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; |
| OptionVals[2] = (void *)LogSize; |
| // Pass a buffer for error message |
| Options[3] = CU_JIT_ERROR_LOG_BUFFER; |
| OptionVals[3] = (void *)ErrorLog; |
| // Pass the size of the error buffer |
| Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; |
| OptionVals[4] = (void *)LogSize; |
| // Make the linker verbose |
| Options[5] = CU_JIT_LOG_VERBOSE; |
| OptionVals[5] = (void *)1; |
| |
| memset(ErrorLog, 0, sizeof(ErrorLog)); |
| |
| CuLinkCreateFcnPtr(6, Options, OptionVals, &LState); |
| Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer, |
| strlen(BinaryBuffer) + 1, 0, 0, 0, 0); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); |
| exit(-1); |
| } |
| |
| Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stderr, "Complete ptx linker step failed.\n"); |
| fprintf(stderr, "\n%s\n", ErrorLog); |
| exit(-1); |
| } |
| |
| debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime, |
| InfoLog); |
| |
| Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule), |
| CuOut); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stderr, "Loading ptx assembly text failed.\n"); |
| exit(-1); |
| } |
| |
| Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda), |
| ((CUDAKernel *)Function->Kernel)->CudaModule, |
| KernelName); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stderr, "Loading kernel function failed.\n"); |
| exit(-1); |
| } |
| |
| CuLinkDestroyFcnPtr(LState); |
| |
| ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer; |
| |
| if (CacheMode) { |
| if (KernelCache[NextCacheItem]) |
| freeKernelCUDA(KernelCache[NextCacheItem]); |
| |
| KernelCache[NextCacheItem] = Function; |
| |
| NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; |
| } |
| |
| return Function; |
| } |
| |
| static void synchronizeDeviceCUDA() { |
| dump_function(); |
| if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) { |
| fprintf(stderr, "Synchronizing device and host memory failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData, |
| long MemSize) { |
| dump_function(); |
| |
| CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda; |
| CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); |
| } |
| |
| static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData, |
| long MemSize) { |
| dump_function(); |
| |
| if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda, |
| MemSize) != CUDA_SUCCESS) { |
| fprintf(stderr, "Copying results from device to host memory failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX, |
| unsigned int GridDimY, unsigned int BlockDimX, |
| unsigned int BlockDimY, unsigned int BlockDimZ, |
| void **Parameters) { |
| dump_function(); |
| |
| unsigned GridDimZ = 1; |
| unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE; |
| CUstream Stream = 0; |
| void **Extra = 0; |
| |
| CUresult Res; |
| Res = |
| CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX, |
| GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ, |
| SharedMemBytes, Stream, Parameters, Extra); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stderr, "Launching CUDA kernel failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| // Maximum number of managed memory pointers. |
| #define DEFAULT_MAX_POINTERS 4000 |
| // For the rationale behing a list of free pointers, see `polly_freeManaged`. |
| void **g_managedptrs; |
| unsigned long long g_nmanagedptrs = 0; |
| unsigned long long g_maxmanagedptrs = 0; |
| |
| __attribute__((constructor)) static void initManagedPtrsBuffer() { |
| g_maxmanagedptrs = DEFAULT_MAX_POINTERS; |
| const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS"); |
| if (maxManagedPointersString) |
| g_maxmanagedptrs = atoll(maxManagedPointersString); |
| |
| g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs); |
| } |
| |
| // Add a pointer as being allocated by cuMallocManaged |
| void addManagedPtr(void *mem) { |
| assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!"); |
| assert(g_nmanagedptrs < g_maxmanagedptrs && |
| "We have hit the maximum number of " |
| "managed pointers allowed. Set the " |
| "POLLY_MAX_MANAGED_POINTERS environment variable. "); |
| g_managedptrs[g_nmanagedptrs++] = mem; |
| } |
| |
| int isManagedPtr(void *mem) { |
| for (unsigned long long i = 0; i < g_nmanagedptrs; i++) { |
| if (g_managedptrs[i] == mem) |
| return 1; |
| } |
| return 0; |
| } |
| |
| void freeManagedCUDA(void *mem) { |
| dump_function(); |
| |
| // In a real-world program this was used (COSMO), there were more `free` |
| // calls in the original source than `malloc` calls. Hence, replacing all |
| // `free`s with `cudaFree` does not work, since we would try to free |
| // 'illegal' memory. |
| // As a quick fix, we keep a free list and check if `mem` is a managed memory |
| // pointer. If it is, we call `cudaFree`. |
| // If not, we pass it along to the underlying allocator. |
| // This is a hack, and can be removed if the underlying issue is fixed. |
| if (isManagedPtr(mem)) { |
| if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) { |
| fprintf(stderr, "cudaFree failed.\n"); |
| exit(-1); |
| } |
| return; |
| } else { |
| free(mem); |
| } |
| } |
| |
| void *mallocManagedCUDA(size_t size) { |
| // Note: [Size 0 allocations] |
| // Sometimes, some runtime computation of size could create a size of 0 |
| // for an allocation. In these cases, we do not wish to fail. |
| // The CUDA API fails on size 0 allocations. |
| // So, we allocate size a minimum of size 1. |
| if (!size && DebugMode) |
| fprintf(stderr, "cudaMallocManaged called with size 0. " |
| "Promoting to size 1"); |
| size = max(size, 1); |
| PollyGPUContext *_ = polly_initContextCUDA(); |
| assert(_ && "polly_initContextCUDA failed"); |
| |
| void *newMemPtr; |
| const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size, |
| CU_MEM_ATTACH_GLOBAL); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); |
| exit(-1); |
| } |
| addManagedPtr(newMemPtr); |
| return newMemPtr; |
| } |
| |
| static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { |
| dump_function(); |
| CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; |
| CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda); |
| free(DevPtr); |
| free(Allocation); |
| } |
| |
| static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) { |
| if (!MemSize && DebugMode) |
| fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. " |
| "Promoting to size 1"); |
| // see: [Size 0 allocations] |
| MemSize = max(MemSize, 1); |
| dump_function(); |
| |
| PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); |
| if (DevData == 0) { |
| fprintf(stderr, |
| "Allocate memory for GPU device memory pointer failed." |
| " Line: %d | Size: %ld\n", |
| __LINE__, MemSize); |
| exit(-1); |
| } |
| DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr)); |
| if (DevData->DevicePtr == 0) { |
| fprintf(stderr, |
| "Allocate memory for GPU device memory pointer failed." |
| " Line: %d | Size: %ld\n", |
| __LINE__, MemSize); |
| exit(-1); |
| } |
| |
| CUresult Res = |
| CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize); |
| |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stderr, |
| "Allocate memory for GPU device memory pointer failed." |
| " Line: %d | Size: %ld\n", |
| __LINE__, MemSize); |
| exit(-1); |
| } |
| |
| return DevData; |
| } |
| |
| static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) { |
| dump_function(); |
| |
| CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; |
| return (void *)DevPtr->Cuda; |
| } |
| |
| static void freeContextCUDA(PollyGPUContext *Context) { |
| dump_function(); |
| |
| CUDAContext *Ctx = (CUDAContext *)Context->Context; |
| if (Ctx->Cuda) { |
| CuProfilerStopFcnPtr(); |
| CuCtxDestroyFcnPtr(Ctx->Cuda); |
| free(Ctx); |
| free(Context); |
| } |
| |
| dlclose(HandleCuda); |
| dlclose(HandleCudaRT); |
| } |
| |
| #endif /* HAS_LIBCUDART */ |
| /******************************************************************************/ |
| /* API */ |
| /******************************************************************************/ |
| |
| PollyGPUContext *polly_initContext() { |
| DebugMode = getenv("POLLY_DEBUG") != 0; |
| CacheMode = getenv("POLLY_NOCACHE") == 0; |
| |
| dump_function(); |
| |
| PollyGPUContext *Context; |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| Context = initContextCUDA(); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| Context = initContextCL(); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| |
| return Context; |
| } |
| |
| void polly_freeKernel(PollyGPUFunction *Kernel) { |
| dump_function(); |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| freeKernelCUDA(Kernel); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| freeKernelCL(Kernel); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| } |
| |
| PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, |
| const char *KernelName) { |
| dump_function(); |
| |
| PollyGPUFunction *Function; |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| Function = getKernelCUDA(BinaryBuffer, KernelName); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| Function = getKernelCL(BinaryBuffer, KernelName); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| |
| return Function; |
| } |
| |
| void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, |
| long MemSize) { |
| dump_function(); |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| copyFromHostToDeviceCUDA(HostData, DevData, MemSize); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| copyFromHostToDeviceCL(HostData, DevData, MemSize); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| } |
| |
| void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, |
| long MemSize) { |
| dump_function(); |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| copyFromDeviceToHostCUDA(DevData, HostData, MemSize); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| copyFromDeviceToHostCL(DevData, HostData, MemSize); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| } |
| |
| void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, |
| unsigned int GridDimY, unsigned int BlockDimX, |
| unsigned int BlockDimY, unsigned int BlockDimZ, |
| void **Parameters) { |
| dump_function(); |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, |
| BlockDimZ, Parameters); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, |
| Parameters); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| } |
| |
| void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { |
| dump_function(); |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| freeDeviceMemoryCUDA(Allocation); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| freeDeviceMemoryCL(Allocation); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| } |
| |
| PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { |
| dump_function(); |
| |
| PollyGPUDevicePtr *DevData; |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| DevData = allocateMemoryForDeviceCUDA(MemSize); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| DevData = allocateMemoryForDeviceCL(MemSize); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| |
| return DevData; |
| } |
| |
| void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { |
| dump_function(); |
| |
| void *DevPtr; |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| DevPtr = getDevicePtrCUDA(Allocation); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| DevPtr = getDevicePtrCL(Allocation); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| |
| return DevPtr; |
| } |
| |
| void polly_synchronizeDevice() { |
| dump_function(); |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| synchronizeDeviceCUDA(); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| synchronizeDeviceCL(); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| } |
| |
| void polly_freeContext(PollyGPUContext *Context) { |
| dump_function(); |
| |
| if (CacheMode) |
| return; |
| |
| switch (Runtime) { |
| #ifdef HAS_LIBCUDART |
| case RUNTIME_CUDA: |
| freeContextCUDA(Context); |
| break; |
| #endif /* HAS_LIBCUDART */ |
| #ifdef HAS_LIBOPENCL |
| case RUNTIME_CL: |
| freeContextCL(Context); |
| break; |
| #endif /* HAS_LIBOPENCL */ |
| default: |
| err_runtime(); |
| } |
| } |
| |
| void polly_freeManaged(void *mem) { |
| dump_function(); |
| |
| #ifdef HAS_LIBCUDART |
| freeManagedCUDA(mem); |
| #else |
| fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n"); |
| exit(-1); |
| #endif |
| } |
| |
| void *polly_mallocManaged(size_t size) { |
| dump_function(); |
| |
| #ifdef HAS_LIBCUDART |
| return mallocManagedCUDA(size); |
| #else |
| fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n"); |
| exit(-1); |
| #endif |
| } |
| |
| /* Initialize GPUJIT with CUDA as runtime library. */ |
| PollyGPUContext *polly_initContextCUDA() { |
| #ifdef HAS_LIBCUDART |
| Runtime = RUNTIME_CUDA; |
| return polly_initContext(); |
| #else |
| fprintf(stderr, "GPU Runtime was built without CUDA support.\n"); |
| exit(-1); |
| #endif /* HAS_LIBCUDART */ |
| } |
| |
| /* Initialize GPUJIT with OpenCL as runtime library. */ |
| PollyGPUContext *polly_initContextCL() { |
| #ifdef HAS_LIBOPENCL |
| Runtime = RUNTIME_CL; |
| return polly_initContext(); |
| #else |
| fprintf(stderr, "GPU Runtime was built without OpenCL support.\n"); |
| exit(-1); |
| #endif /* HAS_LIBOPENCL */ |
| } |