OpenCL

OpenCL is a low-level programming framework that gives direct control over where code is executed. Accelerator code, organized in so-called kernels, is usually written in OpenCL C, which is based on C99. Host code is typically written in C or C++.

For more information visit the Khronos OpenCL homepage. Recommended reading resources are:

Code sample

#include <CL/opencl.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>

/*
 * OpenCL C source text of the "inc" kernel.
 *
 * The kernel takes a buffer of doubles (a_d) and an element count (size);
 * every work-item whose global id is below size adds 1.0 to its element.
 * The text is handed to clCreateProgramWithSource() and compiled at run time.
 *
 * Adjacent string literals are concatenated by the compiler, so each line of
 * kernel code is written as its own literal.
 */
const char *incSource =
    "\n"
    "__kernel void inc(                      \n"
    "   __global double* a_d,                \n"
    "   const unsigned int size) {           \n"
    "   int i = get_global_id(0);            \n"
    "   if(i < size)                         \n"
    "       a_d[i]= a_d[i] + 1.0;            \n"
    "}                                       \n";

/*
 * Report the build log of `program` for `device` and terminate with status 1.
 *
 * The log length is queried first so that logs of any size are printed in
 * full; a fixed-size buffer would make clGetProgramBuildInfo() fail for long
 * logs and leave the buffer contents indeterminate.
 */
static void failWithBuildLog(cl_program program, cl_device_id device) {
    size_t logSize = 0;
    cl_int status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                          0, NULL, &logSize);
    if (status != CL_SUCCESS || logSize == 0) {
        errx(1, "clBuildProgram() failed (no build log available)");
    }

    // One extra byte so the text is terminated even if the runtime does not do it.
    char *log = malloc(logSize + 1);
    if (log == NULL) {
        errx(1, "clBuildProgram() failed (out of memory while fetching build log)");
    }

    status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                   logSize, log, NULL);
    if (status != CL_SUCCESS) {
        errx(1, "clBuildProgram() failed (build log could not be read)");
    }
    log[logSize] = '\0';

    // errx() does not return, so the log buffer is reclaimed at process exit.
    errx(1, "clBuildProgram() failed:\n%s", log);
}

/*
 * Host program: increments every element of a zero-initialized array of
 * 2^16 doubles by 1.0 on the first available OpenCL device and verifies the
 * result on the host.
 *
 * Returns 0 on success. Terminates via errx() with status 1 when a setup,
 * transfer or compute call fails, and with status 2 when the values read
 * back from the device are not all 1.0. Failures while releasing OpenCL
 * objects after the computation are reported with warnx() but are not fatal.
 *
 * The command-line arguments are not used.
 */
int main(int arg, char *argv[]) {
    (void) arg;
    (void) argv;

    const unsigned int size = 1 << 16;
    const size_t numBytes = size * sizeof(double);

    cl_int errCode;

    // Obtain the first available platform.
    cl_platform_id platformID = NULL;
    cl_uint numPlatforms;
    errCode = clGetPlatformIDs(1, &platformID, &numPlatforms);
    if (errCode != CL_SUCCESS) {
        errx(1, "clGetPlatformIDs() failed");
    }

    // Obtain the first available device on the platform
    cl_device_id deviceID = NULL;
    cl_uint numDevices;
    errCode = clGetDeviceIDs( platformID, CL_DEVICE_TYPE_DEFAULT, 1,
                              &deviceID, &numDevices);
    if (errCode != CL_SUCCESS) {
        errx(1, "clGetDeviceIDs() failed");
    }

    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &deviceID, NULL, NULL, &errCode);
    if (errCode != CL_SUCCESS) {
        errx(1, "clCreateContext() failed");
    }

    // Create a command queue
    cl_command_queue commandQueue = clCreateCommandQueue(context, deviceID, 0, &errCode);
    if (errCode != CL_SUCCESS) {
        errx(1, "clCreateCommandQueue() failed");
    }

    printf("* Allocate memory on the host\n");
    double *a = malloc(size * sizeof *a);
    if (a == NULL) {
        errx(1, "malloc a[] failed");
    }

    printf("* Allocate memory on the device\n");
    cl_mem a_d = clCreateBuffer(context, CL_MEM_READ_WRITE, numBytes, NULL, &errCode);
    if (errCode != CL_SUCCESS) {
        errx(1, "clCreateBuffer() failed");
    }

    printf("* Preprocess / initialize data on the host\n");
    printf("  e.g. read data from storage\n");
    for (unsigned int i = 0; i < size; i++) {
        a[i] = 0.;
    }

    printf("* Copy data from the host to the device\n");
    // CL_TRUE makes the write blocking: a[] may be reused once the call returns.
    errCode = clEnqueueWriteBuffer(commandQueue, a_d, CL_TRUE, 0, numBytes, a, 0, NULL, NULL);
    if (errCode != CL_SUCCESS) {
        errx(1, "clEnqueueWriteBuffer() failed");
    }

    //
    printf("* Compute on the device\n");
    //

    // Creates a program object for a context, and loads source code specified by text strings into the program object
    cl_program program = clCreateProgramWithSource(context, 1, &incSource, NULL, &errCode);
    if (errCode != CL_SUCCESS) {
        errx(1, "clCreateProgramWithSource() failed");
    }

    // Builds (compiles and links) a program executable from the program source
    errCode = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
    if (errCode != CL_SUCCESS) {
        failWithBuildLog(program, deviceID);
    }

    // Creates a kernel object
    cl_kernel kernel = clCreateKernel(program, "inc", &errCode);
    if (errCode != CL_SUCCESS) {
        errx(1, "clCreateKernel() failed");
    }

    // Set the argument value for a specific argument of a kernel
    errCode = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_d);
    if (errCode != CL_SUCCESS) {
        errx(1, "clSetKernelArg() failed");
    }
    errCode = clSetKernelArg(kernel, 1, sizeof(unsigned int), &size);
    if (errCode != CL_SUCCESS) {
        errx(1, "clSetKernelArg() failed");
    }

    // Query the maximum workgroup size
    size_t local;
    errCode = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (errCode != CL_SUCCESS) {
        errx(1, "clGetKernelWorkGroupInfo() failed");
    }

    // Enqueues a command to execute a kernel on a device.
    // The global size is rounded up to a whole number of work-groups because
    // an explicit local size must divide the global size evenly; the surplus
    // work-items are filtered out by the "i < size" test inside the kernel.
    size_t global = ((size + local - 1) / local) * local;
    errCode = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (errCode != CL_SUCCESS) {
        errx(1, "clEnqueueNDRangeKernel() failed");
    }

    // Wait for command completion
    errCode = clFinish(commandQueue);
    if (errCode != CL_SUCCESS) {
        errx(1, "clFinish() failed");
    }

    // Release the kernel object
    errCode = clReleaseKernel(kernel);
    if (errCode != CL_SUCCESS) {
        warnx("clReleaseKernel() failed");
    }

    // Release the program object
    errCode = clReleaseProgram(program);
    if (errCode != CL_SUCCESS) {
        warnx("clReleaseProgram() failed");
    }

    // Release the device.
    // NOTE(review): per the OpenCL specification this leaves the reference
    // count unchanged for a root device obtained from clGetDeviceIDs(); it is
    // kept for symmetry with the other release calls.
    errCode = clReleaseDevice(deviceID);
    if (errCode != CL_SUCCESS) {
        warnx("clReleaseDevice() failed");
    }

    printf("* Transfer data back from the device to the host\n");
    // Blocking read: a[] holds the device results once the call returns.
    errCode = clEnqueueReadBuffer(commandQueue, a_d, CL_TRUE, 0, numBytes, a, 0, NULL, NULL);
    if (errCode != CL_SUCCESS) {
        errx(1, "clEnqueueReadBuffer() failed");
    }

    printf("* Delete data on the device\n");
    errCode = clReleaseMemObject(a_d);
    if (errCode != CL_SUCCESS) {
        errx(1, "clReleaseMemObject() failed");
    }

    // Release a command queue
    errCode = clReleaseCommandQueue(commandQueue);
    if (errCode != CL_SUCCESS) {
        warnx("clReleaseCommandQueue() failed");
    }

    // release the context
    errCode = clReleaseContext(context);
    if (errCode != CL_SUCCESS) {
        warnx("clReleaseContext() failed");
    }

    printf("* Postprocess data on the host\n");
    printf("  e.g. write data to storage\n");
    // Every element started at 0.0 and was incremented once, so it must be 1.0.
    for (unsigned int i = 0; i < size; i++) {
        if (a[i] != 1.) {
            errx(2, "Computation on GPU failed");
        }
    }

    printf("* Free memory on the host\n");
    free(a);

    return 0;
}

Compiling OpenCL code

OpenCL programs can be compiled with different compilers. The procedure for this is as follows:

GNU Compiler Collection

## Load GNU compiler and CUDA environment
$ module add \
    compiler/gnu \
    devel/cuda

## Compile C or C++ source code with OpenCL support
$ gcc      ...   ${C_SOURCE} -o ${EXECUTABLE} -lOpenCL
$ g++      ... ${CXX_SOURCE} -o ${EXECUTABLE} -lOpenCL

Intel Compiler

## Load Intel compiler and CUDA environment
$ module add \
    compiler/intel \
    devel/cuda

## Compile C or C++ source code with OpenCL support
$ icc   ...   ${C_SOURCE} -o ${EXECUTABLE} -lOpenCL
$ icpc  ... ${CXX_SOURCE} -o ${EXECUTABLE} -lOpenCL

LLVM Compiler

## Load LLVM compiler and CUDA environment
$ module add \
    compiler/llvm \
    devel/cuda

## Compile C or C++ source code with OpenCL support
$ clang   ...   ${C_SOURCE} -o ${EXECUTABLE} -lOpenCL
$ clang++ ... ${CXX_SOURCE} -o ${EXECUTABLE} -lOpenCL