This is an old revision of the document!
Table of Contents
MPI4Py
TOD
PyTorch
Consider the following simple Python test script (“pytorch_test.py”):
import torch


def test_pytorch():
    """Smoke-test the PyTorch installation.

    Prints the PyTorch version, CUDA availability (and device name when
    present), then runs a small tensor addition on the selected device.

    Returns:
        torch.Tensor: the result of the element-wise addition
        ([5.0, 7.0, 9.0]), so callers can verify the computation.
    """
    print("PyTorch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("CUDA device:", torch.cuda.get_device_name(0))
        device = torch.device("cuda")
    else:
        # Fall back to CPU so the script also works on non-GPU nodes.
        device = torch.device("cpu")

    # Simple tensor operation
    x = torch.tensor([1.0, 2.0, 3.0], device=device)
    y = torch.tensor([4.0, 5.0, 6.0], device=device)
    z = x + y
    print("Tensor operation result:", z)
    return z


if __name__ == "__main__":
    # Guard so importing this module does not trigger the test.
    test_pytorch()
To test it on the unite cluster, you can use the following sbatch script to run it:
#!/bin/bash
# Batch script: run pytorch_test.py on one GPU of the a40 partition.
#SBATCH --job-name=pytorch_test
#SBATCH --output=pytorch_test.out
#SBATCH --error=pytorch_test.err
#SBATCH --time=00:10:00
#SBATCH --partition=a40
#SBATCH --gres=gpu:1
#SBATCH --mem=4G
#SBATCH --cpus-per-task=2

# Load necessary modules (modify based on your system)
# This bundle provides Python 3.13.1 with PyTorch 2.5.1 built against CUDA 12.3.
module load python/pytorch-2.5.1-llvm-cuda-12.3-python-3.13.1-llvm

# Activate your virtual environment if needed
# source ~/your_env/bin/activate

# Run the PyTorch script
python3.13 pytorch_test.py
Pandas
Consider the following simple Python test script (“pandas_test.py”):
import pandas as pd
import numpy as np

# Build a small 4-row DataFrame with three integer columns.
df = pd.DataFrame({
    'A': list(range(1, 5)),
    'B': list(range(5, 9)),
    'C': list(range(9, 13)),
})
print("Original DataFrame:")
print(df)

# Column-wise aggregations.
print("\nSum of each column:")
print(df.sum())
print("\nMean of each column:")
print(df.mean())

# Derived column: element-wise sum of A and B.
df['D'] = df['A'] + df['B']
print("\nDataFrame after adding new column D (A + B):")
print(df)

# Row selection with a boolean mask.
filtered_df = df.loc[df['A'] > 2]
print("\nFiltered DataFrame (A > 2):")
print(filtered_df)

# Count missing values per column (all zero for this data).
print("\nCheck for NaN values:")
print(df.isna().sum())
You can use the following sbatch script to run it:
#!/bin/bash
# Batch script: run pandas_test.py.
# FIX: job-name/output/error were copy-pasted from the PyTorch example
# ("pytorch_test"); renamed to match this job.
#SBATCH --job-name=pandas_test
#SBATCH --output=pandas_test.out
#SBATCH --error=pandas_test.err
#SBATCH --time=00:10:00
#SBATCH --partition=a40
#SBATCH --mem=4G
#SBATCH --cpus-per-task=2
# FIX: removed "--gres=gpu:1" — pandas runs on the CPU only, so requesting
# a GPU wastes a scarce resource.
# NOTE(review): confirm the "a40" partition accepts non-GPU jobs; otherwise
# switch to the "unite" partition used by the other CPU examples.

# Load necessary modules (modify based on your system)
module load python/3.13.1-llvm
module load python/3.13/pandas/2.2.3

# Activate your virtual environment if needed
# source ~/your_env/bin/activate

# Run the pandas script
python3.13 pandas_test.py
Simple C/C++ program
The following is a simple C/C++ program which performs element-wise addition of 2 vectors. It does not use any dependent libraries:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

/*
 * Element-wise addition of two vectors.
 *
 *   a, b   - input vectors of length `size`
 *   result - output buffer, filled with result[i] = a[i] + b[i]
 *   size   - number of elements in each vector
 */
void vector_addition(const double *a, const double *b, double *result, size_t size)
{
    size_t i;
    for (i = 0; i < size; ++i)
        result[i] = a[i] + b[i];
}

int main(void)
{
    const size_t size = 10000000;

    printf("========================================\n");
    printf("Vector Addition Example in C\n");
    printf("========================================\n");
    printf("Vector size: %zu elements\n", size);

    printf("\nAllocating memory...\n");
    double *va  = malloc(size * sizeof *va);
    double *vb  = malloc(size * sizeof *vb);
    double *sum = malloc(size * sizeof *sum);
    if (va == NULL || vb == NULL || sum == NULL) {
        fprintf(stderr, "Error: Memory allocation failed!\n");
        return 1;
    }

    /* Fill both inputs with pseudo-random values in [0, 1]. */
    printf("Initializing vectors...\n");
    srand(time(NULL));
    for (size_t i = 0; i < size; ++i) {
        va[i] = (double)rand() / RAND_MAX;
        vb[i] = (double)rand() / RAND_MAX;
    }

    printf("Performing vector addition...\n");
    vector_addition(va, vb, sum, size);

    /* Show a small sample of the output. */
    printf("First 5 elements of result:\n");
    for (int i = 0; i < 5; ++i)
        printf("  result[%d] = %.6f\n", i, sum[i]);

    free(va);
    free(vb);
    free(sum);
    return 0;
}
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_%j.out file.
#!/bin/bash
# Compile and run the single-core C vector-addition example.
#SBATCH --job-name=vector_sum
#SBATCH --output=vector_sum_%j.out
#SBATCH --error=vector_sum_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""

# Load necessary modules
module load gcc

# Compile the program; bail out early if gcc reports an error.
echo "Compiling vector_sum.c..."
if ! gcc -O3 -march=native -o vector_sum vector_sum.c -lm; then
    echo "Error: Compilation failed!"
    exit 1
fi
echo "Compilation successful!"
echo ""

echo "Running vector_sum..."
./vector_sum
echo ""
echo "Job finished at: $(date)"
Simple Python program
The following is a simple Python program which performs element-wise addition of 2 vectors. It does not use any dependent libraries:
#!/usr/bin/env python3
import random
import time


def vector_addition(a, b):
    """
    Perform element-wise addition of two vectors.

    Parameters:
        a: First input vector (list)
        b: Second input vector (list)

    Returns:
        result: Output vector (a + b)
    """
    # Iterate a's indices so a longer b is ignored and a shorter b raises,
    # exactly like explicit index access would.
    return [value + b[idx] for idx, value in enumerate(a)]


def main():
    """Build two random vectors, add them, and print a sample of the result."""
    size = 10000000

    print("=" * 40)
    print("Vector Addition Example in Python")
    print("=" * 40)
    print(f"Vector size: {size:,} elements")

    print("\nAllocating and initializing vectors...")
    random.seed(time.time())
    vector_a = [random.random() for _ in range(size)]
    vector_b = [random.random() for _ in range(size)]

    print("Performing vector addition...")
    result = vector_addition(vector_a, vector_b)

    print("\nFirst 5 elements of result:")
    for idx in range(5):
        print(f"  result[{idx}] = {result[idx]:.6f}")


if __name__ == "__main__":
    main()
The following is the respective batch script for running the program. You can see the output of the program in the generated vector_sum_%j.out file.
#!/bin/bash
# Batch script: run vector_sum.py (stdlib-only Python) on a single core,
# logging the interpreter details alongside the job information.
#SBATCH --job-name=vector_sum
#SBATCH --output=vector_sum_%j.out
#SBATCH --error=vector_sum_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""

# Record which interpreter is used (no modules are loaded, so this is the
# node's default python3).
echo "Python version:"
python3 --version
echo ""
echo "Python executable location:"
which python3
echo ""

echo "========================================="
echo "Running vector_sum.py"
echo "========================================="
echo ""
python3 vector_sum.py
echo ""
echo "========================================="
echo "Job finished at: $(date)"
echo "========================================="
Python program with dependencies
The following is a simple Python program which computes the sum of 2 vectors 3 times using NumPy.
#!/usr/bin/env python3
import numpy as np
import time


def vector_addition(size=10000000):
    """Create two random vectors of length ``size`` and return their sum.

    Parameters:
        size: number of elements in each vector (default 10,000,000)

    Returns:
        numpy.ndarray of shape (size,) holding the element-wise sum.
    """
    print(f"Initializing vectors of size {size:,}...")
    lhs = np.random.rand(size)
    rhs = np.random.rand(size)
    print("Performing vector addition...")
    return lhs + rhs


def main():
    """Run the addition for several vector sizes and print samples."""
    banner = "=" * 60
    print(banner)
    print("Vector Addition Example using NumPy")
    print(banner)

    for size in (1000000, 10000000, 50000000):
        result = vector_addition(size)
        print(f"\nVector size: {size:,} elements")
        print(f"First 5 elements of result: {result[:5]}")
        print("-" * 60)


if __name__ == "__main__":
    main()
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_numpy_%j.out file. The script showcases 3 different ways for managing Python dependencies based on your use case. This is controlled through the PYTHON_ENV_METHOD variable defined in the script. Please read the comments in the script for the configuration of the environment which you need to do on the login node. The dependency in the current example is NumPy but the approach for dependency management is generic.
#!/bin/bash
# Batch script: run vector_sum_numpy.py under one of three Python
# dependency-management schemes (venv, conda, or environment modules),
# selected via PYTHON_ENV_METHOD below.
#SBATCH --job-name=vector_sum_numpy
#SBATCH --output=vector_sum_numpy_%j.out
#SBATCH --error=vector_sum_numpy_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:15:00
#SBATCH --partition=unite

################################################################################
# CONFIGURATION: Choose your Python environment method
################################################################################
# Options: "venv", "conda", or "module"
PYTHON_ENV_METHOD="venv"

# Settings for each method; only the ones for the selected method are used.
VENV_PATH="$HOME/venvs/numpy_env"
CONDA_ENV_NAME="numpy_env"
CONDA_MODULE="anaconda3"
PYTHON_MODULE="python/3.13"
NUMPY_MODULE="python/3.13/numpy/2.2.2"

################################################################################
# Setup Instructions (run once on login node before first job submission)
################################################################################
# For venv:
#   python3 -m venv $HOME/venvs/numpy_env
#   source $HOME/venvs/numpy_env/bin/activate
#   pip install numpy
#   deactivate
#
# For conda:
#   module load anaconda3
#   conda create -n numpy_env python=3.9 numpy
#   conda deactivate
#
# For module:
#   Check available modules: module avail python
#   You need to load both python and numpy modules in the script. The numpy
#   module needs to be compatible with the python module.
#   Then you need to modify PYTHON_MODULE and NUMPY_MODULE variables above
#   accordingly.
################################################################################

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""
echo "Python environment method: $PYTHON_ENV_METHOD"
echo ""

# Activate the selected environment; fail fast on misconfiguration.
if [ "$PYTHON_ENV_METHOD" = "venv" ]; then
    echo "Activating Python virtual environment..."
    if [ -f "$VENV_PATH/bin/activate" ]; then
        source "$VENV_PATH/bin/activate"
        echo "Virtual environment activated: $VENV_PATH"
    else
        echo "ERROR: Virtual environment not found at $VENV_PATH"
        echo "Please create it first (see setup instructions in script)"
        exit 1
    fi
elif [ "$PYTHON_ENV_METHOD" = "conda" ]; then
    echo "Activating Conda environment..."
    module load "$CONDA_MODULE"
    # NOTE(review): `source activate` is the legacy conda interface; newer
    # conda installations may require `conda activate` — confirm on the cluster.
    source activate "$CONDA_ENV_NAME"
    echo "Conda environment activated: $CONDA_ENV_NAME"
elif [ "$PYTHON_ENV_METHOD" = "module" ]; then
    echo "Loading environment modules..."
    module load "$PYTHON_MODULE"
    module load "$NUMPY_MODULE"
    echo "Modules loaded: $PYTHON_MODULE, $NUMPY_MODULE"
else
    echo "ERROR: Invalid PYTHON_ENV_METHOD='$PYTHON_ENV_METHOD'"
    echo "Valid options: venv, conda, module"
    exit 1
fi

# Verify Python and NumPy so the output file records the exact versions used.
echo ""
echo "Python3 version:"
python3 --version
echo ""
echo "NumPy version:"
python3 -c "import numpy; print(f'NumPy {numpy.__version__}')"
echo ""
echo "Python executable location:"
which python3
echo ""

echo "========================================="
echo "Running vector_sum_numpy.py"
echo "========================================="
echo ""
python3 vector_sum_numpy.py
echo ""

# Deactivate whatever was activated above.
echo "Cleaning up environment..."
if [ "$PYTHON_ENV_METHOD" = "venv" ]; then
    deactivate
    echo "Virtual environment deactivated"
elif [ "$PYTHON_ENV_METHOD" = "conda" ]; then
    conda deactivate
    echo "Conda environment deactivated"
elif [ "$PYTHON_ENV_METHOD" = "module" ]; then
    # Modules are automatically unloaded when job ends
    echo "Modules will be unloaded when job completes"
fi

echo ""
echo "========================================="
echo "Job finished at: $(date)"
echo "========================================="
C/C++ program with dependencies
The following is a simple C/C++ program which compresses and decompresses a string using zLib.
#include <stdio.h>
#include <string.h>
#include <zlib.h>
#include <stdlib.h>

#define CHUNK 16384

/*
 * zlib round-trip demo: compress a fixed string with compress(),
 * decompress it with uncompress(), and verify the output matches the
 * input. Returns 0 on success, 1 on any allocation or zlib failure.
 */
int main(void)
{
    const char *original = "Hello, this is a test string for zlib compression! "
                           "We'll compress this text and then decompress it to verify it works.";

    printf("Original string: %s\n", original);
    printf("Original length: %lu bytes\n\n", strlen(original));

    /* Compression: compressBound() gives the worst-case compressed size. */
    uLong source_len = strlen(original) + 1;   /* +1 keeps the NUL terminator */
    uLong compressed_len = compressBound(source_len);
    unsigned char *compressed = (unsigned char *)malloc(compressed_len);
    if (compressed == NULL) {   /* FIX: malloc result was not checked */
        fprintf(stderr, "Memory allocation failed!\n");
        return 1;
    }

    if (compress(compressed, &compressed_len,
                 (unsigned char *)original, source_len) != Z_OK) {
        fprintf(stderr, "Compression failed!\n");
        free(compressed);
        return 1;
    }
    printf("Compressed length: %lu bytes\n", compressed_len);
    printf("Compression ratio: %.2f%%\n\n",
           100.0 * (1.0 - (double)compressed_len / source_len));

    /* Decompression into a buffer of the known original size. */
    uLong decompressed_len = source_len;
    unsigned char *decompressed = (unsigned char *)malloc(decompressed_len);
    if (decompressed == NULL) {   /* FIX: malloc result was not checked */
        fprintf(stderr, "Memory allocation failed!\n");
        free(compressed);
        return 1;
    }

    if (uncompress(decompressed, &decompressed_len,
                   compressed, compressed_len) != Z_OK) {
        fprintf(stderr, "Decompression failed!\n");
        free(compressed);
        free(decompressed);
        return 1;
    }
    printf("Decompressed string: %s\n", decompressed);
    printf("Decompressed length: %lu bytes\n\n", decompressed_len);

    if (strcmp(original, (char *)decompressed) == 0) {
        printf("SUCCESS: Original and decompressed strings match!\n");
    } else {
        printf("ERROR: Strings don't match!\n");
    }

    free(compressed);
    free(decompressed);
    return 0;
}
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated zlib_compress_%j.out file. C/C++ dependencies are generally compiled from source. If this is the case you can use the Simple C/C++ program example. The batch script showcases loading the dependent library on the system and linking against it. The cluster uses modules for managing the installed dependencies. Make sure to use compatible compiler and library.
#!/bin/bash
# Compile the zlib example against the system zlib module and run it.
#SBATCH --job-name=zlib_compress
#SBATCH --output=zlib_compress_%j.out
#SBATCH --error=zlib_compress_%j.err
#SBATCH --time=00:05:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --partition=unite

# Compiler and library must be compatible; both come from modules here.
module load gcc/latest
module load zlib/1/latest-gcc

echo "Loaded modules:"
module list
echo ""

echo "Compiling zlib_compress.c..."
if ! gcc -o zlib_compress zlib_compress.c -lz; then
    echo "Compilation failed!"
    exit 1
fi

echo "Compilation successful!"
echo ""
echo "Running the program:"
echo "===================="
./zlib_compress
C++ program which uses MPI
The following is an example C/C++ application which uses MPI to perform element-wise addition of two vectors. Each MPI task computes the addition of its local region and then sends it back to the leader. Using MPI with Python is similar assuming that you know how to manage Python dependencies on the cluster which is described in the previous section. What is important here is to understand how to manage the resources of the system.
#include <stdio.h> #include <stdlib.h> #include <mpi.h> #define VECTOR_SIZE 100000 int main(int argc, char** argv) { int rank, size; int i; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); int local_size = VECTOR_SIZE / size; int *local_a = (int*)malloc(local_size * sizeof(int)); int *local_b = (int*)malloc(local_size * sizeof(int)); int *local_c = (int*)malloc(local_size * sizeof(int)); int *a = NULL; int *b = NULL; int *c = NULL; if (rank == 0) { a = (int*)malloc(VECTOR_SIZE * sizeof(int)); b = (int*)malloc(VECTOR_SIZE * sizeof(int)); c = (int*)malloc(VECTOR_SIZE * sizeof(int)); for (i = 0; i < VECTOR_SIZE; i++) { a[i] = i + 1; } for (i = 0; i < VECTOR_SIZE; i++) { b[i] = (i + 1) * 2; } } MPI_Scatter(a, local_size, MPI_INT, local_a, local_size, MPI_INT, 0, MPI_COMM_WORLD); MPI_Scatter(b, local_size, MPI_INT, local_b, local_size, MPI_INT, 0, MPI_COMM_WORLD); printf("Process %d: Adding %d elements\n", rank, local_size); for (i = 0; i < local_size; i++) { local_c[i] = local_a[i] + local_b[i]; } MPI_Gather(local_c, local_size, MPI_INT, c, local_size, MPI_INT, 0, MPI_COMM_WORLD); if (rank == 0) { printf("\nFirst 5 elements of (A + B): "); for (i = 0; i < 5; i++) { printf("%d ", c[i]); } printf("\n"); free(a); free(b); free(c); } free(local_a); free(local_b); free(local_c); MPI_Finalize(); return 0; }
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_mpi_%j.out file. The ntasks parameter of the batch script specifies the number of MPI tasks to be started. This is how you can leverage the resources of the system in order to increase the work done in parallel by your application. The MPI tasks are not guaranteed to be executed on different nodes in the cluster; they can also run on separate physical cores of the same node. What is guaranteed is that every task will have the resources to execute in parallel.
#!/bin/bash
# Compile and run the MPI vector-addition example with 4 tasks.
#SBATCH --job-name=vector_sum_mpi
# FIX: output/error were "vector_mpi_%j.*", contradicting the documentation
# which points readers at vector_sum_mpi_%j.out; names now match.
#SBATCH --output=vector_sum_mpi_%j.out
#SBATCH --error=vector_sum_mpi_%j.err
#SBATCH --ntasks=4
#SBATCH --time=00:05:00
#SBATCH --partition=unite

module load mpi/latest

echo "Compiling vector_sum_mpi.c..."
mpicc -o vector_sum_mpi vector_sum_mpi.c
if [ $? -ne 0 ]; then
    echo "Compilation failed!"
    exit 1
fi
echo "Compilation successful!"

echo "Running with $SLURM_NTASKS MPI processes..."
echo "----------------------------------------"
mpirun -np $SLURM_NTASKS ./vector_sum_mpi
echo "----------------------------------------"
echo "Job completed!"
echo "----------------------------------------"
C++ program which uses multiple threads
The following is a simple C++ program which computes the sum of 2 vectors. It uses multiple threads. Each thread computes the sum for its respective region.
#include <iostream>
#include <vector>
#include <thread>
#include <cstdlib>  // FIX: std::atoi was used without including <cstdlib>

// Total number of elements in each input vector.
#define VECTOR_SIZE 100000

/*
 * Worker: add the [start_idx, end_idx) slice of a and b into c.
 * Each thread writes a disjoint slice of c, so no synchronization is needed.
 */
void vector_add_worker(int thread_id, int start_idx, int end_idx,
                       const int* a, const int* b, int* c) {
    int elements = end_idx - start_idx;
    std::cout << "Thread " << thread_id << ": Adding " << elements
              << " elements" << std::endl;
    for (int i = start_idx; i < end_idx; i++) {
        c[i] = a[i] + b[i];
    }
}

int main(int argc, char** argv) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <number_of_threads>" << std::endl;
        return 1;
    }

    int num_threads = std::atoi(argv[1]);
    if (num_threads <= 0) {
        std::cerr << "Error: Number of threads must be positive" << std::endl;
        return 1;
    }
    std::cout << "Using " << num_threads << " threads" << std::endl;

    // a = 1..N, b = 2, 4, .. 2N.
    std::vector<int> a(VECTOR_SIZE);
    std::vector<int> b(VECTOR_SIZE);
    std::vector<int> c(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        a[i] = i + 1;
        b[i] = (i + 1) * 2;
    }

    // Split the index range into num_threads contiguous slices; the last
    // thread absorbs the remainder (and everything if num_threads > VECTOR_SIZE).
    int elements_per_thread = VECTOR_SIZE / num_threads;
    std::vector<std::thread> threads;
    // FIX: the loop index was `unsigned int`, forcing signed/unsigned
    // comparisons against the validated-positive int num_threads.
    for (int t = 0; t < num_threads; t++) {
        int start_idx = t * elements_per_thread;
        int end_idx = (t == num_threads - 1) ? VECTOR_SIZE
                                             : (t + 1) * elements_per_thread;
        threads.emplace_back(vector_add_worker, t, start_idx, end_idx,
                             a.data(), b.data(), c.data());
    }

    for (auto& thread : threads) {
        thread.join();
    }

    std::cout << "\nFirst 5 elements of (A + B): ";
    for (int i = 0; i < 5; i++) {
        std::cout << c[i] << " ";
    }
    std::cout << std::endl;
    return 0;
}
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_threads_%j.out file. The cpus-per-task parameter of the batch script specifies the number of cores to be allocated for each task (MPI process). You can combine the use of MPI tasks and threads in order to start one process per node. Then each node can use multiple threads locally to do work in parallel, while the threads share the context of the process.
#!/bin/bash
# Compile and run the multi-threaded C++ example, sizing the thread count
# to the number of CPUs SLURM allocates to the task.
#SBATCH --job-name=vector_sum_threads
#SBATCH --output=vector_sum_threads_%j.out
#SBATCH --error=vector_sum_threads_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --time=00:05:00
#SBATCH --partition=unite

echo "Job started at: $(date)"
echo "Running on node: $(hostname)"
echo "Number of CPUs allocated: $SLURM_CPUS_PER_TASK"
echo "----------------------------------------"

module load gcc

echo "Compiling vector_sum_threads.cpp..."
if ! g++ -std=c++11 -pthread -O3 vector_sum_threads.cpp -o vector_sum_threads; then
    echo "Compilation failed!"
    exit 1
fi

echo "Compilation successful!"
echo "----------------------------------------"
echo "Running vector_sum_threads with $SLURM_CPUS_PER_TASK threads..."
./vector_sum_threads $SLURM_CPUS_PER_TASK
echo "----------------------------------------"
echo "Job finished at: $(date)"
C++ program which uses GPU
The following is an example CUDA application which uses an Nvidia GPU to perform element-wise addition of two vectors. Using CUDA with Python is similar, assuming that you know how to manage Python dependencies on the cluster, which is described in a previous section. What is important here is to understand how to manage the resources of the system.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <sys/time.h>

/* Abort with file/line and the CUDA error string if a runtime call fails. */
#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s:%d, %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/*
 * CUDA kernel for vector addition.
 * Each thread computes one element of the result vector.
 *
 * Parameters:
 *   a: First input vector
 *   b: Second input vector
 *   c: Output vector (result)
 *   n: Number of elements
 */
__global__ void vectorAddKernel(const float *a, const float *b, float *c, int n)
{
    // Calculate global thread ID
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard: the last block may have threads past the end of the vectors.
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main()
{
    const int N = 50'000'000;   /* digit separators: .cu files compile as C++14+ */
    const size_t bytes = N * sizeof(float);

    printf("========================================\n");
    printf("CUDA Vector Addition Example\n");
    printf("========================================\n");
    printf("Vector size: %d elements\n", N);

    /* Query and report the GPU the job was given. */
    int deviceId;
    cudaDeviceProp props;
    CUDA_CHECK(cudaGetDevice(&deviceId));
    CUDA_CHECK(cudaGetDeviceProperties(&props, deviceId));
    printf("\nGPU Information:\n");
    printf("  Device: %s\n", props.name);
    printf("  Compute Capability: %d.%d\n", props.major, props.minor);
    printf("  Total Global Memory: %.2f GB\n",
           props.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf("  Multiprocessors: %d\n", props.multiProcessorCount);
    printf("  Max Threads per Block: %d\n", props.maxThreadsPerBlock);
    printf("  Warp Size: %d\n", props.warpSize);

    /* Host buffers. FIX: dropped the unused h_c_cpu allocation that was
       never read or written. */
    printf("\nAllocating host memory...\n");
    float *h_a = (float *)malloc(bytes);
    float *h_b = (float *)malloc(bytes);
    float *h_c_gpu = (float *)malloc(bytes);
    if (!h_a || !h_b || !h_c_gpu) {
        fprintf(stderr, "Error: Host memory allocation failed!\n");
        return 1;
    }

    /* Deterministic input: rand() is used unseeded on purpose here. */
    for (int i = 0; i < N; i++) {
        h_a[i] = (float)rand() / RAND_MAX;
        h_b[i] = (float)rand() / RAND_MAX;
    }

    /* Device buffers and host-to-device transfer.
       FIX: removed a call to getTime(), which was never defined anywhere
       in the program and made it fail to compile. */
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    /* One thread per element, rounded up to whole blocks. */
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    printf("\nKernel Configuration:\n");
    printf("  Threads per block: %d\n", threadsPerBlock);
    printf("  Blocks per grid: %d\n", blocksPerGrid);
    printf("  Total threads: %d\n", blocksPerGrid * threadsPerBlock);

    vectorAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
    /* FIX: kernel launches don't return errors directly — check explicitly. */
    CUDA_CHECK(cudaGetLastError());

    /* cudaMemcpy synchronizes with the kernel before copying back. */
    CUDA_CHECK(cudaMemcpy(h_c_gpu, d_c, bytes, cudaMemcpyDeviceToHost));

    printf("\nFirst 5 elements of result:\n");
    for (int i = 0; i < 5; i++) {
        printf("  c[%d] = %.6f\n", i, h_c_gpu[i]);
    }

    /* FIX: removed cudaEventDestroy(start/stop) — those events were never
       declared or created, another compile error in the original. */
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c_gpu);
    return 0;
}
The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated vector_sum_cuda_%j.out file. The gres parameter of the batch script specifies the number of GPUs to be allocated in total. You can combine the use of MPI tasks and GPUs in order to start one process per node. Then each node can use multiple GPUs locally to do work in parallel.
#!/bin/bash
# Compile and run the CUDA vector-addition example on one allocated GPU.
#SBATCH --job-name=vector_sum_cuda
#SBATCH --output=vector_sum_cuda_%j.out
#SBATCH --error=vector_sum_cuda_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "GPU(s): $SLURM_GPUS_ON_NODE"
echo "Starting at: $(date)"
echo ""

module load nvidia/cuda/12-latest

# FIX: the message said "vector_sum_cuda_cuda.cu" while the compile line
# uses vector_sum_cuda.cu; the message now names the actual file.
echo "Compiling vector_sum_cuda.cu..."
nvcc -O3 -o vector_sum_cuda vector_sum_cuda.cu
if [ $? -ne 0 ]; then
    echo "Error: Compilation failed!"
    exit 1
fi
echo "Compilation successful!"
echo ""

echo "Running vector_sum_cuda..."
./vector_sum_cuda
echo ""
echo "Job finished at: $(date)"
