OpenCL
OpenCL is a low-level programming framework that gives direct control over where code is executed. Accelerator code, in the form of so-called kernels, is usually written in OpenCL C, which is based on C99. Host code is typically written in C or C++.
For more information visit the Khronos OpenCL homepage. Recommended reading resources are:
Code sample
#include <CL/opencl.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
/*
 * OpenCL C source of the "inc" kernel. It is handed to
 * clCreateProgramWithSource() and compiled at run time.
 *
 * The kernel adds 1.0 to every element of a_d. Work-items whose global id
 * is not below `size` do nothing, so the host may launch more work-items
 * than there are elements.
 *
 * Double precision is an optional extension in OpenCL 1.0/1.1 and must be
 * enabled explicitly there; the pragma is guarded so that it is only issued
 * when the device compiler advertises cl_khr_fp64.
 */
const char *incSource =
    "\n"
    "#ifdef cl_khr_fp64\n"
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "#endif\n"
    "__kernel void inc(\n"
    "    __global double* a_d,\n"
    "    const unsigned int size) {\n"
    /* get_global_id() returns size_t; keep that type to avoid a narrowing,
       signed/unsigned comparison against `size`. */
    "    const size_t i = get_global_id(0);\n"
    "    if (i < size)\n"
    "        a_d[i] = a_d[i] + 1.0;\n"
    "}\n";
/* Terminate with exit status 1 and "<call> failed" unless status is CL_SUCCESS. */
static void checkCL(cl_int status, const char *call) {
    if (status != CL_SUCCESS) {
        errx(1, "%s failed", call);
    }
}

/* Print a warning, but keep running, when a clean-up call reports an error. */
static void warnCL(cl_int status, const char *call) {
    if (status != CL_SUCCESS) {
        warnx("%s failed", call);
    }
}

/*
 * Report a failed clBuildProgram() together with the compiler's build log
 * and terminate with exit status 1.
 *
 * The log length is queried first so that logs of any size are printed in
 * full; if the log cannot be retrieved a placeholder is printed instead of
 * the contents of an unfilled buffer.
 */
static void failWithBuildLog(cl_program program, cl_device_id deviceID) {
    size_t logSize = 0;
    cl_int status = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG,
                                          0, NULL, &logSize);
    if (status == CL_SUCCESS && logSize > 0) {
        char *log = malloc(logSize + 1);
        if (log != NULL) {
            status = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG,
                                           logSize, log, NULL);
            if (status == CL_SUCCESS) {
                log[logSize] = '\0';
                errx(1, "clBuildProgram() failed:\n%s", log);
            }
            free(log);
        }
    }
    errx(1, "clBuildProgram() failed:\n%s", "(build log unavailable)");
}

/*
 * Host program of the OpenCL sample.
 *
 * Picks the first platform and its default device, uploads an array of
 * 2^16 doubles initialised to 0, runs the "inc" kernel from incSource once
 * over the array, downloads the result and verifies that every element
 * equals 1.
 *
 * The command-line arguments are not used.
 *
 * Returns 0 on success. Terminates with exit status 1 when an OpenCL call
 * or the host allocation fails, and with exit status 2 when the downloaded
 * data does not match the expected result.
 */
int main(int arg, char *argv[]) {
    (void) arg;
    (void) argv;

    /* cl_uint matches the kernel's `unsigned int size` argument exactly. */
    const cl_uint size = 1 << 16;
    const size_t numBytes = (size_t) size * sizeof(double);
    cl_int errCode;

    // Obtain the first available platform.
    cl_platform_id platformID = NULL;
    cl_uint numPlatforms = 0;
    errCode = clGetPlatformIDs(1, &platformID, &numPlatforms);
    if (errCode != CL_SUCCESS || numPlatforms == 0) {
        errx(1, "clGetPlatformIDs() failed");
    }

    // Obtain the first available device on the platform.
    cl_device_id deviceID = NULL;
    cl_uint numDevices = 0;
    errCode = clGetDeviceIDs(platformID, CL_DEVICE_TYPE_DEFAULT, 1,
                             &deviceID, &numDevices);
    if (errCode != CL_SUCCESS || numDevices == 0) {
        errx(1, "clGetDeviceIDs() failed");
    }

    // Create an OpenCL context.
    cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &errCode);
    checkCL(errCode, "clCreateContext()");

    // Create a command queue.
    cl_command_queue commandQueue = clCreateCommandQueue(context, deviceID, 0, &errCode);
    checkCL(errCode, "clCreateCommandQueue()");

    printf("* Allocate memory on the host\n");
    double *a = malloc(numBytes);
    if (a == NULL) {
        errx(1, "malloc a[] failed");
    }

    printf("* Allocate memory on the device\n");
    cl_mem a_d = clCreateBuffer(context, CL_MEM_READ_WRITE, numBytes, NULL, &errCode);
    checkCL(errCode, "clCreateBuffer()");

    printf("* Preprocess / initialize data on the host\n");
    printf(" e.g. read data from storage\n");
    for (cl_uint i = 0; i < size; i++) {
        a[i] = 0.;
    }

    printf("* Copy data from the host to the device\n");
    // Blocking write: the call returns once a[] has been copied.
    errCode = clEnqueueWriteBuffer(commandQueue, a_d, CL_TRUE, 0, numBytes, a,
                                   0, NULL, NULL);
    checkCL(errCode, "clEnqueueWriteBuffer()");

    printf("* Compute on the device\n");

    // Create a program object for the context and load the kernel source into it.
    cl_program program = clCreateProgramWithSource(context, 1, &incSource, NULL, &errCode);
    checkCL(errCode, "clCreateProgramWithSource()");

    // Build (compile and link) the program executable from the program source.
    errCode = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
    if (errCode != CL_SUCCESS) {
        failWithBuildLog(program, deviceID);
    }

    // Create the kernel object.
    cl_kernel kernel = clCreateKernel(program, "inc", &errCode);
    checkCL(errCode, "clCreateKernel()");

    // Set the kernel arguments: the device buffer and the element count.
    errCode = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_d);
    checkCL(errCode, "clSetKernelArg()");
    errCode = clSetKernelArg(kernel, 1, sizeof(size), &size);
    checkCL(errCode, "clSetKernelArg()");

    // Query the maximum work-group size for this kernel on this device.
    size_t local = 0;
    errCode = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE,
                                       sizeof(local), &local, NULL);
    if (errCode != CL_SUCCESS || local == 0) {
        errx(1, "clGetKernelWorkGroupInfo() failed");
    }

    // Enqueue the kernel. With an explicit local size the global size has to
    // be a multiple of it, so round up; the kernel ignores work-items whose
    // index is not below `size`.
    size_t global = ((size + local - 1) / local) * local;
    errCode = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global, &local,
                                     0, NULL, NULL);
    checkCL(errCode, "clEnqueueNDRangeKernel()");

    // Wait for command completion.
    errCode = clFinish(commandQueue);
    checkCL(errCode, "clFinish()");

    // Release the kernel and program objects; a failure here does not
    // invalidate the computed data, so only warn.
    warnCL(clReleaseKernel(kernel), "clReleaseKernel()");
    warnCL(clReleaseProgram(program), "clReleaseProgram()");
    // deviceID is a root device returned by clGetDeviceIDs(); releasing a
    // root device has no effect, so no release call is made for it.

    printf("* Transfer data back from the device to the host\n");
    // Blocking read: a[] holds the results when the call returns.
    errCode = clEnqueueReadBuffer(commandQueue, a_d, CL_TRUE, 0, numBytes, a,
                                  0, NULL, NULL);
    checkCL(errCode, "clEnqueueReadBuffer()");

    printf("* Delete data on the device\n");
    errCode = clReleaseMemObject(a_d);
    checkCL(errCode, "clReleaseMemObject()");

    // Release the command queue and the context.
    warnCL(clReleaseCommandQueue(commandQueue), "clReleaseCommandQueue()");
    warnCL(clReleaseContext(context), "clReleaseContext()");

    printf("* Postprocess data on the host\n");
    printf(" e.g. write data to storage\n");
    for (cl_uint i = 0; i < size; i++) {
        if (a[i] != 1.) {
            errx(2, "Computation on GPU failed");
        }
    }

    printf("* Free memory on the host\n");
    free(a);
    return 0;
}
Compiling OpenCL code
OpenCL programs can be compiled with different compilers. The procedure for this is as follows:
- GNU compiler:
  ## Load GNU compiler and CUDA environment
  $ module add \
        compiler/gnu \
        devel/cuda
  ## Compile C or C++ source code with OpenCL support
  $ gcc ... ${C_SOURCE} -o ${EXECUTABLE} -lOpenCL
  $ g++ ... ${CXX_SOURCE} -o ${EXECUTABLE} -lOpenCL
- Intel compiler:
  ## Load Intel compiler and CUDA environment
  $ module add \
        compiler/intel \
        devel/cuda
  ## Compile C or C++ source code with OpenCL support
  $ icc ... ${C_SOURCE} -o ${EXECUTABLE} -lOpenCL
  $ icpc ... ${CXX_SOURCE} -o ${EXECUTABLE} -lOpenCL
- LLVM compiler:
  ## Load LLVM compiler and CUDA environment
  $ module add \
        compiler/llvm \
        devel/cuda
  ## Compile C or C++ source code with OpenCL support
  $ clang ... ${C_SOURCE} -o ${EXECUTABLE} -lOpenCL
  $ clang++ ... ${CXX_SOURCE} -o ${EXECUTABLE} -lOpenCL
Last update:
April 12, 2022