This is an old revision of the document!
Table of Contents
MPI4Py
TOD
PyTorch
Consider the following simple Python test script (“pytorch_test.py”):
import torch


def test_pytorch():
    """Smoke-test the PyTorch installation.

    Prints the PyTorch version, CUDA availability (and device name when
    present), then runs a small tensor addition on the selected device.

    Returns:
        torch.Tensor: the result of the element-wise addition
        ([5.0, 7.0, 9.0]), so callers can verify the computation.
    """
    print("PyTorch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("CUDA device:", torch.cuda.get_device_name(0))
        device = torch.device("cuda")
    else:
        # Fall back to CPU so the script also works on non-GPU nodes.
        device = torch.device("cpu")

    # Simple tensor operation
    x = torch.tensor([1.0, 2.0, 3.0], device=device)
    y = torch.tensor([4.0, 5.0, 6.0], device=device)
    z = x + y
    print("Tensor operation result:", z)
    return z


if __name__ == "__main__":
    # Guard so importing this module does not trigger the test.
    test_pytorch()
To test it on the unite cluster, you can use the following sbatch script to run it:
#!/bin/bash
# Batch script: run pytorch_test.py on one GPU of the a40 partition.
#SBATCH --job-name=pytorch_test
#SBATCH --output=pytorch_test.out
#SBATCH --error=pytorch_test.err
#SBATCH --time=00:10:00
#SBATCH --partition=a40
#SBATCH --gres=gpu:1
#SBATCH --mem=4G
#SBATCH --cpus-per-task=2

# Load necessary modules (modify based on your system)
# This bundle provides Python 3.13.1 with PyTorch 2.5.1 built against CUDA 12.3.
module load python/pytorch-2.5.1-llvm-cuda-12.3-python-3.13.1-llvm

# Activate your virtual environment if needed
# source ~/your_env/bin/activate

# Run the PyTorch script
python3.13 pytorch_test.py
Pandas
Consider the following simple Python test script (“pandas_test.py”):
import pandas as pd
import numpy as np

# Build a small 4-row DataFrame with three integer columns.
df = pd.DataFrame({
    'A': list(range(1, 5)),
    'B': list(range(5, 9)),
    'C': list(range(9, 13)),
})
print("Original DataFrame:")
print(df)

# Column-wise aggregations.
print("\nSum of each column:")
print(df.sum())
print("\nMean of each column:")
print(df.mean())

# Derived column: element-wise sum of A and B.
df['D'] = df['A'] + df['B']
print("\nDataFrame after adding new column D (A + B):")
print(df)

# Row selection with a boolean mask.
filtered_df = df.loc[df['A'] > 2]
print("\nFiltered DataFrame (A > 2):")
print(filtered_df)

# Count missing values per column (all zero for this data).
print("\nCheck for NaN values:")
print(df.isna().sum())
You can use the following sbatch script to run it:
#!/bin/bash
# Batch script: run pandas_test.py.
# FIX: job-name/output/error were copy-pasted from the PyTorch example
# ("pytorch_test"); renamed to match this job.
#SBATCH --job-name=pandas_test
#SBATCH --output=pandas_test.out
#SBATCH --error=pandas_test.err
#SBATCH --time=00:10:00
#SBATCH --partition=a40
#SBATCH --mem=4G
#SBATCH --cpus-per-task=2
# FIX: removed "--gres=gpu:1" — pandas runs on the CPU only, so requesting
# a GPU wastes a scarce resource.
# NOTE(review): confirm the "a40" partition accepts non-GPU jobs; otherwise
# switch to the "unite" partition used by the other CPU examples.

# Load necessary modules (modify based on your system)
module load python/3.13.1-llvm
module load python/3.13/pandas/2.2.3

# Activate your virtual environment if needed
# source ~/your_env/bin/activate

# Run the pandas script
python3.13 pandas_test.py
Simple C/C++ program
The following is a simple C/C++ program which performs element-wise addition of 2 vectors. It does not use any dependent libraries:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

/*
 * Element-wise addition of two vectors.
 *
 *   a, b   - input vectors of length `size`
 *   result - output buffer, filled with result[i] = a[i] + b[i]
 *   size   - number of elements in each vector
 */
void vector_addition(const double *a, const double *b, double *result, size_t size)
{
    size_t i;
    for (i = 0; i < size; ++i)
        result[i] = a[i] + b[i];
}

int main(void)
{
    const size_t size = 10000000;

    printf("========================================\n");
    printf("Vector Addition Example in C\n");
    printf("========================================\n");
    printf("Vector size: %zu elements\n", size);

    printf("\nAllocating memory...\n");
    double *va  = malloc(size * sizeof *va);
    double *vb  = malloc(size * sizeof *vb);
    double *sum = malloc(size * sizeof *sum);
    if (va == NULL || vb == NULL || sum == NULL) {
        fprintf(stderr, "Error: Memory allocation failed!\n");
        return 1;
    }

    /* Fill both inputs with pseudo-random values in [0, 1]. */
    printf("Initializing vectors...\n");
    srand(time(NULL));
    for (size_t i = 0; i < size; ++i) {
        va[i] = (double)rand() / RAND_MAX;
        vb[i] = (double)rand() / RAND_MAX;
    }

    printf("Performing vector addition...\n");
    vector_addition(va, vb, sum, size);

    /* Show a small sample of the output. */
    printf("First 5 elements of result:\n");
    for (int i = 0; i < 5; ++i)
        printf("  result[%d] = %.6f\n", i, sum[i]);

    free(va);
    free(vb);
    free(sum);
    return 0;
}
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_%j.out file.
#!/bin/bash
# Compile and run the single-core C vector-addition example.
#SBATCH --job-name=vector_sum
#SBATCH --output=vector_sum_%j.out
#SBATCH --error=vector_sum_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""

# Load necessary modules
module load gcc

# Compile the program; bail out early if gcc reports an error.
echo "Compiling vector_sum.c..."
if ! gcc -O3 -march=native -o vector_sum vector_sum.c -lm; then
    echo "Error: Compilation failed!"
    exit 1
fi
echo "Compilation successful!"
echo ""

echo "Running vector_sum..."
./vector_sum
echo ""
echo "Job finished at: $(date)"
Simple Python program
The following is a simple Python program which performs element-wise addition of 2 vectors. It does not use any dependent libraries:
#!/usr/bin/env python3
import random
import time


def vector_addition(a, b):
    """
    Perform element-wise addition of two vectors.

    Parameters:
        a: First input vector (list)
        b: Second input vector (list)

    Returns:
        result: Output vector (a + b)
    """
    # Iterate a's indices so a longer b is ignored and a shorter b raises,
    # exactly like explicit index access would.
    return [value + b[idx] for idx, value in enumerate(a)]


def main():
    """Build two random vectors, add them, and print a sample of the result."""
    size = 10000000

    print("=" * 40)
    print("Vector Addition Example in Python")
    print("=" * 40)
    print(f"Vector size: {size:,} elements")

    print("\nAllocating and initializing vectors...")
    random.seed(time.time())
    vector_a = [random.random() for _ in range(size)]
    vector_b = [random.random() for _ in range(size)]

    print("Performing vector addition...")
    result = vector_addition(vector_a, vector_b)

    print("\nFirst 5 elements of result:")
    for idx in range(5):
        print(f"  result[{idx}] = {result[idx]:.6f}")


if __name__ == "__main__":
    main()
The following is the respective batch script for running the program. You can see the output of the program in the generated vector_sum_%j.out file.
#!/bin/bash
# Batch script: run vector_sum.py (stdlib-only Python) on a single core,
# logging the interpreter details alongside the job information.
#SBATCH --job-name=vector_sum
#SBATCH --output=vector_sum_%j.out
#SBATCH --error=vector_sum_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""

# Record which interpreter is used (no modules are loaded, so this is the
# node's default python3).
echo "Python version:"
python3 --version
echo ""
echo "Python executable location:"
which python3
echo ""

echo "========================================="
echo "Running vector_sum.py"
echo "========================================="
echo ""
python3 vector_sum.py
echo ""
echo "========================================="
echo "Job finished at: $(date)"
echo "========================================="
Python program with dependencies
The following is a simple Python program which computes the sum of 2 vectors 3 times using NumPy.
#!/usr/bin/env python3
import numpy as np
import time


def vector_addition(size=10000000):
    """Create two random vectors of length ``size`` and return their sum.

    Parameters:
        size: number of elements in each vector (default 10,000,000)

    Returns:
        numpy.ndarray of shape (size,) holding the element-wise sum.
    """
    print(f"Initializing vectors of size {size:,}...")
    lhs = np.random.rand(size)
    rhs = np.random.rand(size)
    print("Performing vector addition...")
    return lhs + rhs


def main():
    """Run the addition for several vector sizes and print samples."""
    banner = "=" * 60
    print(banner)
    print("Vector Addition Example using NumPy")
    print(banner)

    for size in (1000000, 10000000, 50000000):
        result = vector_addition(size)
        print(f"\nVector size: {size:,} elements")
        print(f"First 5 elements of result: {result[:5]}")
        print("-" * 60)


if __name__ == "__main__":
    main()
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_numpy_%j.out file. The script showcases 3 different ways for managing Python dependencies based on your use case. This is controlled through the PYTHON_ENV_METHOD variable defined in the script. Please read the comments in the script for the configuration of the environment which you need to do on the login node. The dependency in the current example is NumPy but the approach for dependency management is generic.
#!/bin/bash
# Batch script: run vector_sum_numpy.py under one of three Python
# dependency-management schemes (venv, conda, or environment modules),
# selected via PYTHON_ENV_METHOD below.
#SBATCH --job-name=vector_sum_numpy
#SBATCH --output=vector_sum_numpy_%j.out
#SBATCH --error=vector_sum_numpy_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:15:00
#SBATCH --partition=unite

################################################################################
# CONFIGURATION: Choose your Python environment method
################################################################################
# Options: "venv", "conda", or "module"
PYTHON_ENV_METHOD="venv"

# Settings for each method; only the ones for the selected method are used.
VENV_PATH="$HOME/venvs/numpy_env"
CONDA_ENV_NAME="numpy_env"
CONDA_MODULE="anaconda3"
PYTHON_MODULE="python/3.13"
NUMPY_MODULE="python/3.13/numpy/2.2.2"

################################################################################
# Setup Instructions (run once on login node before first job submission)
################################################################################
# For venv:
#   python3 -m venv $HOME/venvs/numpy_env
#   source $HOME/venvs/numpy_env/bin/activate
#   pip install numpy
#   deactivate
#
# For conda:
#   module load anaconda3
#   conda create -n numpy_env python=3.9 numpy
#   conda deactivate
#
# For module:
#   Check available modules: module avail python
#   You need to load both python and numpy modules in the script. The numpy
#   module needs to be compatible with the python module.
#   Then you need to modify PYTHON_MODULE and NUMPY_MODULE variables above
#   accordingly.
################################################################################

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""
echo "Python environment method: $PYTHON_ENV_METHOD"
echo ""

# Activate the selected environment; fail fast on misconfiguration.
if [ "$PYTHON_ENV_METHOD" = "venv" ]; then
    echo "Activating Python virtual environment..."
    if [ -f "$VENV_PATH/bin/activate" ]; then
        source "$VENV_PATH/bin/activate"
        echo "Virtual environment activated: $VENV_PATH"
    else
        echo "ERROR: Virtual environment not found at $VENV_PATH"
        echo "Please create it first (see setup instructions in script)"
        exit 1
    fi
elif [ "$PYTHON_ENV_METHOD" = "conda" ]; then
    echo "Activating Conda environment..."
    module load "$CONDA_MODULE"
    # NOTE(review): `source activate` is the legacy conda interface; newer
    # conda installations may require `conda activate` — confirm on the cluster.
    source activate "$CONDA_ENV_NAME"
    echo "Conda environment activated: $CONDA_ENV_NAME"
elif [ "$PYTHON_ENV_METHOD" = "module" ]; then
    echo "Loading environment modules..."
    module load "$PYTHON_MODULE"
    module load "$NUMPY_MODULE"
    echo "Modules loaded: $PYTHON_MODULE, $NUMPY_MODULE"
else
    echo "ERROR: Invalid PYTHON_ENV_METHOD='$PYTHON_ENV_METHOD'"
    echo "Valid options: venv, conda, module"
    exit 1
fi

# Verify Python and NumPy so the output file records the exact versions used.
echo ""
echo "Python3 version:"
python3 --version
echo ""
echo "NumPy version:"
python3 -c "import numpy; print(f'NumPy {numpy.__version__}')"
echo ""
echo "Python executable location:"
which python3
echo ""

echo "========================================="
echo "Running vector_sum_numpy.py"
echo "========================================="
echo ""
python3 vector_sum_numpy.py
echo ""

# Deactivate whatever was activated above.
echo "Cleaning up environment..."
if [ "$PYTHON_ENV_METHOD" = "venv" ]; then
    deactivate
    echo "Virtual environment deactivated"
elif [ "$PYTHON_ENV_METHOD" = "conda" ]; then
    conda deactivate
    echo "Conda environment deactivated"
elif [ "$PYTHON_ENV_METHOD" = "module" ]; then
    # Modules are automatically unloaded when job ends
    echo "Modules will be unloaded when job completes"
fi

echo ""
echo "========================================="
echo "Job finished at: $(date)"
echo "========================================="
C/C++ program with dependencies
The following is a simple C/C++ program which compresses and decompresses a string using zLib.
#include <stdio.h>
#include <string.h>
#include <zlib.h>
#include <stdlib.h>

#define CHUNK 16384

/*
 * zlib round-trip demo: compress a fixed string with compress(),
 * decompress it with uncompress(), and verify the output matches the
 * input. Returns 0 on success, 1 on any allocation or zlib failure.
 */
int main(void)
{
    const char *original = "Hello, this is a test string for zlib compression! "
                           "We'll compress this text and then decompress it to verify it works.";

    printf("Original string: %s\n", original);
    printf("Original length: %lu bytes\n\n", strlen(original));

    /* Compression: compressBound() gives the worst-case compressed size. */
    uLong source_len = strlen(original) + 1;   /* +1 keeps the NUL terminator */
    uLong compressed_len = compressBound(source_len);
    unsigned char *compressed = (unsigned char *)malloc(compressed_len);
    if (compressed == NULL) {   /* FIX: malloc result was not checked */
        fprintf(stderr, "Memory allocation failed!\n");
        return 1;
    }

    if (compress(compressed, &compressed_len,
                 (unsigned char *)original, source_len) != Z_OK) {
        fprintf(stderr, "Compression failed!\n");
        free(compressed);
        return 1;
    }
    printf("Compressed length: %lu bytes\n", compressed_len);
    printf("Compression ratio: %.2f%%\n\n",
           100.0 * (1.0 - (double)compressed_len / source_len));

    /* Decompression into a buffer of the known original size. */
    uLong decompressed_len = source_len;
    unsigned char *decompressed = (unsigned char *)malloc(decompressed_len);
    if (decompressed == NULL) {   /* FIX: malloc result was not checked */
        fprintf(stderr, "Memory allocation failed!\n");
        free(compressed);
        return 1;
    }

    if (uncompress(decompressed, &decompressed_len,
                   compressed, compressed_len) != Z_OK) {
        fprintf(stderr, "Decompression failed!\n");
        free(compressed);
        free(decompressed);
        return 1;
    }
    printf("Decompressed string: %s\n", decompressed);
    printf("Decompressed length: %lu bytes\n\n", decompressed_len);

    if (strcmp(original, (char *)decompressed) == 0) {
        printf("SUCCESS: Original and decompressed strings match!\n");
    } else {
        printf("ERROR: Strings don't match!\n");
    }

    free(compressed);
    free(decompressed);
    return 0;
}
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated zlib_compress_%j.out file. C/C++ dependencies are generally compiled from source. If this is the case you can use the Simple C/C++ program example. The batch script showcases loading the dependent library on the system and linking against it. The cluster uses modules for managing the installed dependencies. Make sure to use compatible compiler and library.
#!/bin/bash
# Compile the zlib example against the system zlib module and run it.
#SBATCH --job-name=zlib_compress
#SBATCH --output=zlib_compress_%j.out
#SBATCH --error=zlib_compress_%j.err
#SBATCH --time=00:05:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --partition=unite

# Compiler and library must be compatible; both come from modules here.
module load gcc/latest
module load zlib/1/latest-gcc

echo "Loaded modules:"
module list
echo ""

echo "Compiling zlib_compress.c..."
if ! gcc -o zlib_compress zlib_compress.c -lz; then
    echo "Compilation failed!"
    exit 1
fi

echo "Compilation successful!"
echo ""
echo "Running the program:"
echo "===================="
./zlib_compress
C++ program which uses MPI
The following is an example C/C++ application which uses MPI to perform element-wise addition of two vectors. Each MPI task computes the addition of its local region and then sends it back to the leader. Using MPI with Python is similar assuming that you know how to manage Python dependencies on the cluster which is described in the previous section. What is important here is to understand how to manage the resources of the system.
#include <stdio.h> #include <stdlib.h> #include <mpi.h> #define VECTOR_SIZE 100000 int main(int argc, char** argv) { int rank, size; int i; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); int local_size = VECTOR_SIZE / size; int *local_a = (int*)malloc(local_size * sizeof(int)); int *local_b = (int*)malloc(local_size * sizeof(int)); int *local_c = (int*)malloc(local_size * sizeof(int)); int *a = NULL; int *b = NULL; int *c = NULL; if (rank == 0) { a = (int*)malloc(VECTOR_SIZE * sizeof(int)); b = (int*)malloc(VECTOR_SIZE * sizeof(int)); c = (int*)malloc(VECTOR_SIZE * sizeof(int)); for (i = 0; i < VECTOR_SIZE; i++) { a[i] = i + 1; } for (i = 0; i < VECTOR_SIZE; i++) { b[i] = (i + 1) * 2; } } MPI_Scatter(a, local_size, MPI_INT, local_a, local_size, MPI_INT, 0, MPI_COMM_WORLD); MPI_Scatter(b, local_size, MPI_INT, local_b, local_size, MPI_INT, 0, MPI_COMM_WORLD); printf("Process %d: Adding %d elements\n", rank, local_size); for (i = 0; i < local_size; i++) { local_c[i] = local_a[i] + local_b[i]; } MPI_Gather(local_c, local_size, MPI_INT, c, local_size, MPI_INT, 0, MPI_COMM_WORLD); if (rank == 0) { printf("\nFirst 5 elements of (A + B): "); for (i = 0; i < 5; i++) { printf("%d ", c[i]); } printf("\n"); free(a); free(b); free(c); } free(local_a); free(local_b); free(local_c); MPI_Finalize(); return 0; }
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_mpi_%j.out file. The ntasks parameter of the batch script specifies the number of MPI tasks to be started. This is how you can leverage the resources of the system in order to increase the work done in parallel by your application. The MPI tasks are not guaranteed to be executed on different nodes in the cluster; they can also run on separate physical cores of the same node. What is guaranteed is that every task will have the resources to execute in parallel.
#!/bin/bash
# Compile and run the MPI vector-addition example with 4 tasks.
#SBATCH --job-name=vector_sum_mpi
# FIX: output/error were "vector_mpi_%j.*", contradicting the documentation
# which points readers at vector_sum_mpi_%j.out; names now match.
#SBATCH --output=vector_sum_mpi_%j.out
#SBATCH --error=vector_sum_mpi_%j.err
#SBATCH --ntasks=4
#SBATCH --time=00:05:00
#SBATCH --partition=unite

module load mpi/latest

echo "Compiling vector_sum_mpi.c..."
mpicc -o vector_sum_mpi vector_sum_mpi.c
if [ $? -ne 0 ]; then
    echo "Compilation failed!"
    exit 1
fi
echo "Compilation successful!"

echo "Running with $SLURM_NTASKS MPI processes..."
echo "----------------------------------------"
mpirun -np $SLURM_NTASKS ./vector_sum_mpi
echo "----------------------------------------"
echo "Job completed!"
echo "----------------------------------------"
C++ program which uses multiple threads
The following is a simple C++ program which computes the sum of 2 vectors. It uses multiple threads. Each thread computes the sum for its respective region.
#include <iostream>
#include <vector>
#include <thread>
#include <cstdlib>  // FIX: std::atoi was used without including <cstdlib>

// Total number of elements in each input vector.
#define VECTOR_SIZE 100000

/*
 * Worker: add the [start_idx, end_idx) slice of a and b into c.
 * Each thread writes a disjoint slice of c, so no synchronization is needed.
 */
void vector_add_worker(int thread_id, int start_idx, int end_idx,
                       const int* a, const int* b, int* c) {
    int elements = end_idx - start_idx;
    std::cout << "Thread " << thread_id << ": Adding " << elements
              << " elements" << std::endl;
    for (int i = start_idx; i < end_idx; i++) {
        c[i] = a[i] + b[i];
    }
}

int main(int argc, char** argv) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <number_of_threads>" << std::endl;
        return 1;
    }

    int num_threads = std::atoi(argv[1]);
    if (num_threads <= 0) {
        std::cerr << "Error: Number of threads must be positive" << std::endl;
        return 1;
    }
    std::cout << "Using " << num_threads << " threads" << std::endl;

    // a = 1..N, b = 2, 4, .. 2N.
    std::vector<int> a(VECTOR_SIZE);
    std::vector<int> b(VECTOR_SIZE);
    std::vector<int> c(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        a[i] = i + 1;
        b[i] = (i + 1) * 2;
    }

    // Split the index range into num_threads contiguous slices; the last
    // thread absorbs the remainder (and everything if num_threads > VECTOR_SIZE).
    int elements_per_thread = VECTOR_SIZE / num_threads;
    std::vector<std::thread> threads;
    // FIX: the loop index was `unsigned int`, forcing signed/unsigned
    // comparisons against the validated-positive int num_threads.
    for (int t = 0; t < num_threads; t++) {
        int start_idx = t * elements_per_thread;
        int end_idx = (t == num_threads - 1) ? VECTOR_SIZE
                                             : (t + 1) * elements_per_thread;
        threads.emplace_back(vector_add_worker, t, start_idx, end_idx,
                             a.data(), b.data(), c.data());
    }

    for (auto& thread : threads) {
        thread.join();
    }

    std::cout << "\nFirst 5 elements of (A + B): ";
    for (int i = 0; i < 5; i++) {
        std::cout << c[i] << " ";
    }
    std::cout << std::endl;
    return 0;
}
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_threads_%j.out file. The cpus-per-task parameter of the batch script specifies the number of cores to be allocated for each task (MPI process). You can combine the use of MPI tasks and threads in order to start one process per node. Then each node can use multiple threads locally to do work in parallel, while the threads share the context of the process.
#!/bin/bash
# Compile and run the multi-threaded C++ example, sizing the thread count
# to the number of CPUs SLURM allocates to the task.
#SBATCH --job-name=vector_sum_threads
#SBATCH --output=vector_sum_threads_%j.out
#SBATCH --error=vector_sum_threads_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --time=00:05:00
#SBATCH --partition=unite

echo "Job started at: $(date)"
echo "Running on node: $(hostname)"
echo "Number of CPUs allocated: $SLURM_CPUS_PER_TASK"
echo "----------------------------------------"

module load gcc

echo "Compiling vector_sum_threads.cpp..."
if ! g++ -std=c++11 -pthread -O3 vector_sum_threads.cpp -o vector_sum_threads; then
    echo "Compilation failed!"
    exit 1
fi

echo "Compilation successful!"
echo "----------------------------------------"
echo "Running vector_sum_threads with $SLURM_CPUS_PER_TASK threads..."
./vector_sum_threads $SLURM_CPUS_PER_TASK
echo "----------------------------------------"
echo "Job finished at: $(date)"
C++ program which uses GPU
The following is an example CUDA application which uses an Nvidia GPU to perform element-wise addition of two vectors. Using CUDA with Python is similar, assuming that you know how to manage Python dependencies on the cluster, which is described in a previous section. What is important here is to understand how to manage the resources of the system.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <sys/time.h>

/* Abort with file/line and the CUDA error string if a runtime call fails. */
#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s:%d, %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/*
 * CUDA kernel for vector addition.
 * Each thread computes one element of the result vector.
 *
 * Parameters:
 *   a: First input vector
 *   b: Second input vector
 *   c: Output vector (result)
 *   n: Number of elements
 */
__global__ void vectorAddKernel(const float *a, const float *b, float *c, int n)
{
    // Calculate global thread ID
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard: the last block may have threads past the end of the vectors.
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main()
{
    const int N = 50'000'000;   /* digit separators: .cu files compile as C++14+ */
    const size_t bytes = N * sizeof(float);

    printf("========================================\n");
    printf("CUDA Vector Addition Example\n");
    printf("========================================\n");
    printf("Vector size: %d elements\n", N);

    /* Query and report the GPU the job was given. */
    int deviceId;
    cudaDeviceProp props;
    CUDA_CHECK(cudaGetDevice(&deviceId));
    CUDA_CHECK(cudaGetDeviceProperties(&props, deviceId));
    printf("\nGPU Information:\n");
    printf("  Device: %s\n", props.name);
    printf("  Compute Capability: %d.%d\n", props.major, props.minor);
    printf("  Total Global Memory: %.2f GB\n",
           props.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf("  Multiprocessors: %d\n", props.multiProcessorCount);
    printf("  Max Threads per Block: %d\n", props.maxThreadsPerBlock);
    printf("  Warp Size: %d\n", props.warpSize);

    /* Host buffers. FIX: dropped the unused h_c_cpu allocation that was
       never read or written. */
    printf("\nAllocating host memory...\n");
    float *h_a = (float *)malloc(bytes);
    float *h_b = (float *)malloc(bytes);
    float *h_c_gpu = (float *)malloc(bytes);
    if (!h_a || !h_b || !h_c_gpu) {
        fprintf(stderr, "Error: Host memory allocation failed!\n");
        return 1;
    }

    /* Deterministic input: rand() is used unseeded on purpose here. */
    for (int i = 0; i < N; i++) {
        h_a[i] = (float)rand() / RAND_MAX;
        h_b[i] = (float)rand() / RAND_MAX;
    }

    /* Device buffers and host-to-device transfer.
       FIX: removed a call to getTime(), which was never defined anywhere
       in the program and made it fail to compile. */
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    /* One thread per element, rounded up to whole blocks. */
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    printf("\nKernel Configuration:\n");
    printf("  Threads per block: %d\n", threadsPerBlock);
    printf("  Blocks per grid: %d\n", blocksPerGrid);
    printf("  Total threads: %d\n", blocksPerGrid * threadsPerBlock);

    vectorAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
    /* FIX: kernel launches don't return errors directly — check explicitly. */
    CUDA_CHECK(cudaGetLastError());

    /* cudaMemcpy synchronizes with the kernel before copying back. */
    CUDA_CHECK(cudaMemcpy(h_c_gpu, d_c, bytes, cudaMemcpyDeviceToHost));

    printf("\nFirst 5 elements of result:\n");
    for (int i = 0; i < 5; i++) {
        printf("  c[%d] = %.6f\n", i, h_c_gpu[i]);
    }

    /* FIX: removed cudaEventDestroy(start/stop) — those events were never
       declared or created, another compile error in the original. */
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c_gpu);
    return 0;
}
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_cuda_%j.out file. The gres parameter of the batch script specifies the number of GPUs to be allocated in total. You can combine the use of MPI tasks and GPUs in order to start one process per node. Then each node can use multiple GPUs locally to do work in parallel.
#!/bin/bash
# Compile and run the CUDA vector-addition example on one allocated GPU.
#SBATCH --job-name=vector_sum_cuda
#SBATCH --output=vector_sum_cuda_%j.out
#SBATCH --error=vector_sum_cuda_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "GPU(s): $SLURM_GPUS_ON_NODE"
echo "Starting at: $(date)"
echo ""

module load nvidia/cuda/12-latest

# FIX: the message said "vector_sum_cuda_cuda.cu" while the compile line
# uses vector_sum_cuda.cu; the message now names the actual file.
echo "Compiling vector_sum_cuda.cu..."
nvcc -O3 -o vector_sum_cuda vector_sum_cuda.cu
if [ $? -ne 0 ]; then
    echo "Error: Compilation failed!"
    exit 1
fi
echo "Compilation successful!"
echo ""

echo "Running vector_sum_cuda..."
./vector_sum_cuda
echo ""
echo "Job finished at: $(date)"
