Message Passing (MPI)

MPI is a standardized and portable message-passing interface for communication between distributed processes. Both point-to-point and collective communication are supported.
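
The example program later in this section demonstrates point-to-point communication. As a minimal, self-contained sketch of a collective operation (not taken from the NHR@KIT examples), summing a value across all ranks with MPI_Allreduce looks like this:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    // Every rank contributes its own rank number; MPI_Allreduce sums
    // the contributions and returns the result to all ranks.
    int local = world_rank;
    int sum = 0;
    MPI_Allreduce(&local, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    printf("Rank %d: sum over all ranks = %d\n", world_rank, sum);

    MPI_Finalize();
    return 0;
}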

There is a wide variety of training material available about using MPI, so this documentation focuses on the basics and on combining MPI libraries with GPU programming models.

Compilers

The following table lists the MPI compiler commands available on most of the systems operated by NHR@KIT, together with the compiler family, the underlying default compiler, and the languages and application binary interfaces (ABIs) they support.

Compiler Family   MPI Compiler Command   Default Compiler   Supported Language(s)   Supported ABIs
                  mpicc                  gcc, cc            C                       32/64 bit
                  mpicxx                 g++                C/C++                   32/64 bit
                  mpifc                  gfortran           Fortran 77/Fortran 95   32/64 bit
GNU               mpigcc                 gcc                C                       32/64 bit
GNU               mpigxx                 g++                C/C++                   32/64 bit
GNU               mpif77                 g77                Fortran 77              32/64 bit
GNU               mpif90                 gfortran           Fortran 95              32/64 bit
Intel             mpiicc                 icc                C                       32/64 bit
Intel             mpiicpc                icpc               C++                     32/64 bit
Intel             mpiifort               ifort              Fortran 77/Fortran 95   32/64 bit
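
For example, a C source file can be compiled with the generic wrapper and started with mpirun (file names are illustrative):

## Compile and start an MPI program with the generic wrapper
$ mpicc allreduce.c -o allreduce
$ mpirun -n 4 ./allreduce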

GPU Offloading and MPI

All MPI communication takes place between the private memory spaces of the distributed processes. Data residing in GPU memory therefore normally has to be copied to this private host memory before it can be sent. With a so-called CUDA-aware MPI implementation, this staging copy is not required and transfers can be performed directly from and to GPU memory.
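
A minimal sketch of the difference (the helper function send_from_gpu and its cuda_aware flag are illustrative, not part of any MPI API):

#include <mpi.h>
#include <cuda_runtime.h>

// Send 'size' doubles that reside in GPU memory (d_buf) to rank 'dest'.
// Without a CUDA-aware MPI the data must be staged through the host
// buffer h_buf; with a CUDA-aware MPI, d_buf can be passed directly.
void send_from_gpu(double *d_buf, double *h_buf, int size, int dest,
                   int cuda_aware)
{
    if (cuda_aware) {
        // Direct transfer from GPU memory (CUDA-aware MPI only)
        MPI_Send(d_buf, size, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD);
    } else {
        // Staging copy: device to host, then send the host buffer
        cudaMemcpy(h_buf, d_buf, size * sizeof(double),
                   cudaMemcpyDeviceToHost);
        MPI_Send(h_buf, size, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD);
    }
}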

The following simple MPI program demonstrates host-to-host, host-to-device, and device-to-device MPI point-to-point communication:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include "cuda.h"
#include "cuda_runtime.h"

int main(int argc, char* argv[]) {
    // Initialize the MPI execution environment
    MPI_Init(&argc, &argv);

    // Get the size of the group associated with communicator MPI_COMM_WORLD
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Get the rank of the calling process in the communicator MPI_COMM_WORLD
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    printf("* Allocate memory on [%1i],CPU\n", world_rank);
    int size = 1000;
    double *a = (double *) malloc(size * sizeof(double));
    if (a == NULL) {
        errx(1, "malloc a[] failed");
    }
    printf("* Allocate memory [%1i],GPU\n", world_rank);
    double *d_a;
    if (cudaMalloc((void **) &d_a, size * sizeof(double)) != cudaSuccess) {
        errx(1, "cudaMalloc d_a[] failed");
    }

    printf("* Initalize memory on [%1i],CPU\n", world_rank);
    for (int i = 0; i < size; i++) {
        a[i] = (double) world_rank;
    }

    // The transfers below assume exactly two ranks (one sender, one receiver)
    if (world_size != 2) {
        errx(1, "this example requires exactly 2 MPI processes");
    }

    MPI_Status status;
    int err = MPI_SUCCESS;

    // Note: passing the device pointer d_a directly to MPI_Send/MPI_Recv
    // requires a CUDA-aware MPI implementation.
    // From [0],CPU to [1],GPU
    if      (world_rank == 0) {
        printf("* Send from [%1i],CPU\n", world_rank);
        err = MPI_Send(  a, size, MPI_DOUBLE, 1, 1, MPI_COMM_WORLD);
    }
    else if (world_rank == 1) {
        printf("* Receive to [%1i],GPU\n", world_rank);
        err = MPI_Recv(d_a, size, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &status);
    }
    if (err != MPI_SUCCESS) {
        errx(2, "MPI transport from [0],CPU to [1],GPU failed");
    }

    // From [1],GPU to [0],GPU
    if      (world_rank == 1) {
        printf("* Send from [%1i],GPU\n", world_rank);
        err = MPI_Send(d_a, size, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD);
    }
    else if (world_rank == 0) {
        printf("* Receive to [%1i],GPU\n", world_rank);
        err = MPI_Recv(d_a, size, MPI_DOUBLE, 1, 2, MPI_COMM_WORLD, &status);
    }
    if (err != MPI_SUCCESS) {
        errx(2, "MPI transport from [1],GPU to [0],GPU failed");
    }

    // From [0],GPU to [1],CPU
    if      (world_rank == 0) {
        printf("* Send from [%1i],GPU\n", world_rank);
        err = MPI_Send(d_a, size, MPI_DOUBLE, 1, 3, MPI_COMM_WORLD);
    }
    else if (world_rank == 1) {
        printf("* Receive to [%1i],CPU \n", world_rank);
        err = MPI_Recv(  a, size, MPI_DOUBLE, 0, 3, MPI_COMM_WORLD, &status);
    }
    if (err != MPI_SUCCESS) {
        errx(2, "MPI transport from [0],GPU to [1],CPU failed");
    }

    // From [1],CPU to [0],CPU
    if      (world_rank == 1) {
        printf("* Send from [%1i],CPU\n", world_rank);
        err = MPI_Send(  a, size, MPI_DOUBLE, 0, 4, MPI_COMM_WORLD);
    }
    else if (world_rank == 0) {
        printf("* Receive to [%1i],CPU\n", world_rank);
        err = MPI_Recv(  a, size, MPI_DOUBLE, 1, 4, MPI_COMM_WORLD, &status);
    }
    if (err != MPI_SUCCESS) {
        errx(2, "MPI transport from [1],CPU to [0],CPU failed");
    }

    // Check host memory
    for (int i = 0; i < size; i++) {
        if (a[i] != 0.) {
            errx(2, "MPI transport failed");
        }
    }

    printf("* Free memory on [%1i],GPU\n", world_rank);
    cudaFree(d_a);

    printf("* Free memory on [%1i],CPU\n", world_rank);
    free(a);

    // Terminates MPI execution environment
    MPI_Finalize();
}

MPI programs can be compiled with different compiler and MPI environments as follows (a check for CUDA support in the loaded MPI library is sketched after the list):

  • GNU Compiler Collection, OpenMPI

    ## Load GNU compiler, OpenMPI and CUDA environment
    $ module add \
        compiler/gnu \
        mpi/openmpi \
        devel/cuda
    
    ## Compile C or C++ source code and link the CUDA runtime
    $ mpicc  ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
    $ mpicxx ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
    
  • LLVM Compiler, OpenMPI

    ## Load LLVM compiler, OpenMPI and CUDA environment
    $ module add \
        compiler/llvm  \
        mpi/openmpi \
        devel/cuda
    
    ## Compile C or C++ source code and link the CUDA runtime
    $ mpicc  ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
    $ mpicxx ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
    
  • NVIDIA High Performance Computing (HPC) SDK

    ## Load NVIDIA HPC SDK environment
    $ module add \
        toolkit/nvidia-hpc-sdk \
        devel/cuda

    ## Compile C or C++ source code and link the CUDA runtime
    $ mpicc  ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
    $ mpicxx ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
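
Whether the loaded Open MPI installation was built with CUDA support can be checked with ompi_info; the exact parameter name may differ between Open MPI versions:

## Check whether Open MPI reports CUDA support
$ ompi_info --parsable --all | grep mpi_built_with_cuda_support:value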
    

An MPI program with accelerator support is started as usual with mpirun. On the accelerated partition of the HoreKa cluster, each node provides 2 CPUs with 38 cores each and 4 GPUs. The following call places two ranks per socket with 19 cores each, i.e. four ranks per node, and assigns each rank its own GPU via CUDA_VISIBLE_DEVICES, so that the CPU cores are distributed evenly across the GPUs:

$ mpirun \
    --display-map \
    --display-allocation \
    --map-by ppr:2:socket:pe=19 \
    --bind-to core \
    bash -c \
        'export CUDA_VISIBLE_DEVICES=${OMPI_COMM_WORLD_LOCAL_RANK};
        ${EXECUTABLE}'
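
The same GPU assignment can also be done inside the program instead of via CUDA_VISIBLE_DEVICES. A minimal sketch (the helper function select_gpu is illustrative) that maps Open MPI's node-local rank, exported in OMPI_COMM_WORLD_LOCAL_RANK, to a device:

#include <stdlib.h>
#include <cuda_runtime.h>

// Bind the calling MPI rank to one GPU based on its node-local rank.
// Assumes one rank per GPU, as in the mpirun mapping shown above.
void select_gpu(void)
{
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 1) {
        return;  // no GPU visible to this rank
    }

    // Open MPI exports the node-local rank in this environment variable
    const char *local_rank = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
    int rank = (local_rank != NULL) ? atoi(local_rank) : 0;

    cudaSetDevice(rank % num_devices);
}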

Last update: November 29, 2022