====Description====

This page provides examples of how to use the cluster. There are language-specific examples for **C/C++** and **Python**, which show how to compile and run applications written in those languages on the cluster. Additionally, there are examples of how to leverage the different resources of the cluster. These examples are written in **C++**, but the concepts apply to a program written in any language.

----

====Simple C/C++ program====

The following is a simple **C/C++** program which performs element-wise addition of two vectors. It does **not** use any dependent libraries:

<code c>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

/*
 * Perform element-wise addition of two vectors
 *
 * Parameters:
 *   a: First input vector
 *   b: Second input vector
 *   result: Output vector (a + b)
 *   size: Number of elements in vectors
 */
void vector_addition(const double *a, const double *b, double *result, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        result[i] = a[i] + b[i];
    }
}

int main()
{
    const size_t size = 10000000;

    printf("========================================\n");
    printf("Vector Addition Example in C\n");
    printf("========================================\n");
    printf("Vector size: %zu elements\n", size);

    printf("\nAllocating memory...\n");
    double *vector_a = (double *)malloc(size * sizeof(double));
    double *vector_b = (double *)malloc(size * sizeof(double));
    double *result = (double *)malloc(size * sizeof(double));

    if (vector_a == NULL || vector_b == NULL || result == NULL) {
        fprintf(stderr, "Error: Memory allocation failed!\n");
        return 1;
    }

    printf("Initializing vectors...\n");
    srand(time(NULL));
    for (size_t i = 0; i < size; i++) {
        vector_a[i] = (double)rand() / RAND_MAX;
        vector_b[i] = (double)rand() / RAND_MAX;
    }

    printf("Performing vector addition...\n");
    vector_addition(vector_a, vector_b, result, size);

    printf("First 5 elements of result:\n");
    for (int i = 0; i < 5; i++) {
        printf("  result[%d] = %.6f\n", i, result[i]);
    }

    free(vector_a);
    free(vector_b);
    free(result);

    return 0;
}
</code>

The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated //vector_sum_%j.out// file.

<code bash>
#!/bin/bash
#SBATCH --job-name=vector_sum
#SBATCH --output=vector_sum_%j.out
#SBATCH --error=vector_sum_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""

# Load necessary modules
module load gcc

# Compile the program
echo "Compiling vector_sum.c..."
gcc -O3 -march=native -o vector_sum vector_sum.c -lm

if [ $? -ne 0 ]; then
    echo "Error: Compilation failed!"
    exit 1
fi

echo "Compilation successful!"
echo ""
echo "Running vector_sum..."
./vector_sum

echo ""
echo "Job finished at: $(date)"
</code>

----

====Simple Python program====

The following is a simple **Python** program which performs element-wise addition of two vectors.
It does **not** use any dependent libraries:

<code python>
#!/usr/bin/env python3
import random
import time


def vector_addition(a, b):
    """
    Perform element-wise addition of two vectors

    Parameters:
        a: First input vector (list)
        b: Second input vector (list)

    Returns:
        result: Output vector (a + b)
    """
    return [a[i] + b[i] for i in range(len(a))]


def main():
    size = 10000000

    print("=" * 40)
    print("Vector Addition Example in Python")
    print("=" * 40)
    print(f"Vector size: {size:,} elements")

    print("\nAllocating and initializing vectors...")
    random.seed(time.time())
    vector_a = [random.random() for _ in range(size)]
    vector_b = [random.random() for _ in range(size)]

    print("Performing vector addition...")
    result = vector_addition(vector_a, vector_b)

    print("\nFirst 5 elements of result:")
    for i in range(5):
        print(f"  result[{i}] = {result[i]:.6f}")


if __name__ == "__main__":
    main()
</code>

The following is the respective batch script for running the program. You can see the output of the program in the generated //vector_sum_%j.out// file.

<code bash>
#!/bin/bash
#SBATCH --job-name=vector_sum
#SBATCH --output=vector_sum_%j.out
#SBATCH --error=vector_sum_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""
echo "Python version:"
python3 --version
echo ""
echo "Python executable location:"
which python3
echo ""
echo "========================================="
echo "Running vector_sum.py"
echo "========================================="
echo ""

python3 vector_sum.py

echo ""
echo "========================================="
echo "Job finished at: $(date)"
echo "========================================="
</code>

----

====Python program with dependencies====

The following is a simple **Python** program which performs element-wise addition of two vectors for three different vector sizes using **NumPy**.

<code python>
#!/usr/bin/env python3
import numpy as np
import time


def vector_addition(size=10000000):
    print(f"Initializing vectors of size {size:,}...")
    vector_a = np.random.rand(size)
    vector_b = np.random.rand(size)

    print("Performing vector addition...")
    result = vector_a + vector_b
    return result


def main():
    print("=" * 60)
    print("Vector Addition Example using NumPy")
    print("=" * 60)

    sizes = [1000000, 10000000, 50000000]
    for size in sizes:
        result = vector_addition(size)
        print(f"\nVector size: {size:,} elements")
        print(f"First 5 elements of result: {result[:5]}")
        print("-" * 60)


if __name__ == "__main__":
    main()
</code>

The following is the respective batch script for running the program. You can see the output of the program in the generated //vector_sum_numpy_%j.out// file.

The script showcases three different ways of managing **Python** dependencies, depending on your use case. This is controlled through the **PYTHON_ENV_METHOD** variable defined in the script. Please read the comments in the script for the environment setup which you need to do once on the login node. The dependency in the current example is **NumPy**, but the approach to dependency management is generic.
<code bash>
#!/bin/bash
#SBATCH --job-name=vector_sum_numpy
#SBATCH --output=vector_sum_numpy_%j.out
#SBATCH --error=vector_sum_numpy_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:15:00
#SBATCH --partition=unite

################################################################################
# CONFIGURATION: Choose your Python environment method
################################################################################
# Options: "venv", "conda", or "module"
PYTHON_ENV_METHOD="venv"

VENV_PATH="$HOME/venvs/numpy_env"
CONDA_ENV_NAME="numpy_env"
CONDA_MODULE="anaconda3"
PYTHON_MODULE="python/3.13"
NUMPY_MODULE="python/3.13/numpy/2.2.2"

################################################################################
# Setup Instructions (run once on login node before first job submission)
################################################################################
# For venv:
#   python3 -m venv $HOME/venvs/numpy_env
#   source $HOME/venvs/numpy_env/bin/activate
#   pip install numpy
#   deactivate
#
# For conda:
#   module load anaconda3
#   conda create -n numpy_env python=3.9 numpy
#   conda deactivate
#
# For module:
#   Check available modules: module avail python
#   You need to load both the python and numpy modules in the script. The
#   numpy module needs to be compatible with the python module. Then you
#   need to modify the PYTHON_MODULE and NUMPY_MODULE variables above
#   accordingly.
################################################################################

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "Starting at: $(date)"
echo ""
echo "Python environment method: $PYTHON_ENV_METHOD"
echo ""

if [ "$PYTHON_ENV_METHOD" = "venv" ]; then
    echo "Activating Python virtual environment..."
    if [ -f "$VENV_PATH/bin/activate" ]; then
        source "$VENV_PATH/bin/activate"
        echo "Virtual environment activated: $VENV_PATH"
    else
        echo "ERROR: Virtual environment not found at $VENV_PATH"
        echo "Please create it first (see setup instructions in script)"
        exit 1
    fi
elif [ "$PYTHON_ENV_METHOD" = "conda" ]; then
    echo "Activating Conda environment..."
    module load "$CONDA_MODULE"
    source activate "$CONDA_ENV_NAME"
    echo "Conda environment activated: $CONDA_ENV_NAME"
elif [ "$PYTHON_ENV_METHOD" = "module" ]; then
    echo "Loading environment modules..."
    module load "$PYTHON_MODULE"
    module load "$NUMPY_MODULE"
    echo "Modules loaded: $PYTHON_MODULE, $NUMPY_MODULE"
else
    echo "ERROR: Invalid PYTHON_ENV_METHOD='$PYTHON_ENV_METHOD'"
    echo "Valid options: venv, conda, module"
    exit 1
fi

# Verify Python and NumPy
echo ""
echo "Python3 version:"
python3 --version
echo ""
echo "NumPy version:"
python3 -c "import numpy; print(f'NumPy {numpy.__version__}')"
echo ""
echo "Python executable location:"
which python3
echo ""
echo "========================================="
echo "Running vector_sum_numpy.py"
echo "========================================="
echo ""

python3 vector_sum_numpy.py

echo ""
echo "Cleaning up environment..."
if [ "$PYTHON_ENV_METHOD" = "venv" ]; then
    deactivate
    echo "Virtual environment deactivated"
elif [ "$PYTHON_ENV_METHOD" = "conda" ]; then
    conda deactivate
    echo "Conda environment deactivated"
elif [ "$PYTHON_ENV_METHOD" = "module" ]; then
    # Modules are automatically unloaded when the job ends
    echo "Modules will be unloaded when job completes"
fi

echo ""
echo "========================================="
echo "Job finished at: $(date)"
echo "========================================="
</code>

----

====C/C++ program with dependencies====

The following is a simple **C/C++** program which compresses and decompresses a string using **zlib**.

<code c>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

#define CHUNK 16384

int main()
{
    const char *original =
        "Hello, this is a test string for zlib compression! "
        "We'll compress this text and then decompress it to verify it works.";

    printf("Original string: %s\n", original);
    printf("Original length: %zu bytes\n\n", strlen(original));

    // Compression
    uLong source_len = strlen(original) + 1;
    uLong compressed_len = compressBound(source_len);
    unsigned char *compressed = (unsigned char *)malloc(compressed_len);

    if (compress(compressed, &compressed_len, (unsigned char *)original, source_len) != Z_OK) {
        fprintf(stderr, "Compression failed!\n");
        free(compressed);
        return 1;
    }

    printf("Compressed length: %lu bytes\n", compressed_len);
    printf("Compression ratio: %.2f%%\n\n",
           100.0 * (1.0 - (double)compressed_len / source_len));

    // Decompression
    uLong decompressed_len = source_len;
    unsigned char *decompressed = (unsigned char *)malloc(decompressed_len);

    if (uncompress(decompressed, &decompressed_len, compressed, compressed_len) != Z_OK) {
        fprintf(stderr, "Decompression failed!\n");
        free(compressed);
        free(decompressed);
        return 1;
    }

    printf("Decompressed string: %s\n", decompressed);
    printf("Decompressed length: %lu bytes\n\n", decompressed_len);

    if (strcmp(original, (char *)decompressed) == 0) {
        printf("SUCCESS: Original and decompressed strings match!\n");
    } else {
        printf("ERROR: Strings don't match!\n");
    }

    free(compressed);
    free(decompressed);
    return 0;
}
</code>

The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated //zlib_compress_%j.out// file.

**C/C++** dependencies are often compiled from source; in that case, you can follow the **Simple C/C++ program** example and build the dependency together with your application. The batch script here showcases the other approach: loading a dependent library which is already installed on the system and linking against it. The cluster uses modules for managing the installed dependencies. Make sure to use a compatible compiler and library.

<code bash>
#!/bin/bash
#SBATCH --job-name=zlib_compress
#SBATCH --output=zlib_compress_%j.out
#SBATCH --error=zlib_compress_%j.err
#SBATCH --time=00:05:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --partition=unite

module load gcc/latest
module load zlib/1/latest-gcc

echo "Loaded modules:"
module list
echo ""

echo "Compiling zlib_compress.c..."
gcc -o zlib_compress zlib_compress.c -lz

if [ $? -eq 0 ]; then
    echo "Compilation successful!"
    echo ""
    echo "Running the program:"
    echo "===================="
    ./zlib_compress
else
    echo "Compilation failed!"
    exit 1
fi
</code>

----

====C++ program which uses MPI====

The following is an example **C/C++** application which uses **MPI** to perform element-wise addition of two vectors. Each **MPI** task computes the addition of its local region and then sends the result back to the leader.

Using **MPI** with **Python** is similar, assuming that you know how to manage **Python** dependencies on the cluster, which is described in the previous section. What is important here is to understand how to manage the resources of the system.
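For reference, here is a minimal **mpi4py** sketch of the same pattern. It is an illustration only: it assumes the //mpi4py// and //numpy// packages are installed in your **Python** environment (for example with ''pip install mpi4py numpy'' inside a venv) and that the vector size is divisible by the number of tasks, as in the **C** example.

<code python>
#!/usr/bin/env python3
# Hedged mpi4py sketch of the same computation as the C example below.
# Assumes the mpi4py and numpy packages are installed in your environment.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

n = 100000
local_n = n // size  # assumes n is divisible by the number of tasks

# The leader initializes the full vectors; the other ranks pass None.
a = np.arange(1, n + 1, dtype="i") if rank == 0 else None
b = np.arange(2, 2 * n + 1, 2, dtype="i") if rank == 0 else None

local_a = np.empty(local_n, dtype="i")
local_b = np.empty(local_n, dtype="i")
comm.Scatter(a, local_a, root=0)
comm.Scatter(b, local_b, root=0)

local_c = local_a + local_b  # each task adds its own region

c = np.empty(n, dtype="i") if rank == 0 else None
comm.Gather(local_c, c, root=0)
if rank == 0:
    print("First 5 elements of (A + B):", c[:5])
</code>

Such a script would be launched the same way as the compiled program, e.g. ''mpirun -np $SLURM_NTASKS python3 vector_sum_mpi.py'', from a batch script which first activates your **Python** environment. The **C** version of the example follows.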
<code c>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define VECTOR_SIZE 100000

int main(int argc, char** argv)
{
    int rank, size;
    int i;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int local_size = VECTOR_SIZE / size;

    int *local_a = (int*)malloc(local_size * sizeof(int));
    int *local_b = (int*)malloc(local_size * sizeof(int));
    int *local_c = (int*)malloc(local_size * sizeof(int));

    int *a = NULL;
    int *b = NULL;
    int *c = NULL;

    if (rank == 0) {
        a = (int*)malloc(VECTOR_SIZE * sizeof(int));
        b = (int*)malloc(VECTOR_SIZE * sizeof(int));
        c = (int*)malloc(VECTOR_SIZE * sizeof(int));

        for (i = 0; i < VECTOR_SIZE; i++) {
            a[i] = i + 1;
        }
        for (i = 0; i < VECTOR_SIZE; i++) {
            b[i] = (i + 1) * 2;
        }
    }

    MPI_Scatter(a, local_size, MPI_INT, local_a, local_size, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatter(b, local_size, MPI_INT, local_b, local_size, MPI_INT, 0, MPI_COMM_WORLD);

    printf("Process %d: Adding %d elements\n", rank, local_size);
    for (i = 0; i < local_size; i++) {
        local_c[i] = local_a[i] + local_b[i];
    }

    MPI_Gather(local_c, local_size, MPI_INT, c, local_size, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("\nFirst 5 elements of (A + B): ");
        for (i = 0; i < 5; i++) {
            printf("%d ", c[i]);
        }
        printf("\n");

        free(a);
        free(b);
        free(c);
    }

    free(local_a);
    free(local_b);
    free(local_c);

    MPI_Finalize();
    return 0;
}
</code>

The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated //vector_sum_mpi_%j.out// file.

The //ntasks// parameter of the batch script specifies the number of **MPI** tasks to be started. This is how you can leverage the resources of the system in order to increase the work done in parallel by your application. The **MPI** tasks are not guaranteed to be executed on different nodes of the cluster; they can also run on separate physical cores of the same node. What is guaranteed is that every task will have the resources to execute in parallel.

<code bash>
#!/bin/bash
#SBATCH --job-name=vector_sum_mpi
#SBATCH --output=vector_sum_mpi_%j.out
#SBATCH --error=vector_sum_mpi_%j.err
#SBATCH --ntasks=4
#SBATCH --time=00:05:00
#SBATCH --partition=unite

module load mpi/latest

echo "Compiling vector_sum_mpi.c..."
mpicc -o vector_sum_mpi vector_sum_mpi.c

if [ $? -ne 0 ]; then
    echo "Compilation failed!"
    exit 1
fi

echo "Compilation successful!"
echo "Running with $SLURM_NTASKS MPI processes..."
echo "----------------------------------------"

mpirun -np $SLURM_NTASKS ./vector_sum_mpi

echo "----------------------------------------"
echo "Job completed!"
echo "----------------------------------------"
</code>

----

====C++ program which uses multiple threads====

The following is a simple **C++** program which computes the sum of two vectors. It uses multiple **threads**; each **thread** computes the sum for its respective region.
<code cpp>
#include <iostream>
#include <vector>
#include <thread>
#include <cstdlib>

#define VECTOR_SIZE 100000

void vector_add_worker(int thread_id, int start_idx, int end_idx,
                       const int* a, const int* b, int* c)
{
    int elements = end_idx - start_idx;
    std::cout << "Thread " << thread_id << ": Adding " << elements
              << " elements" << std::endl;

    for (int i = start_idx; i < end_idx; i++) {
        c[i] = a[i] + b[i];
    }
}

int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <num_threads>" << std::endl;
        return 1;
    }

    int num_threads = std::atoi(argv[1]);
    if (num_threads <= 0) {
        std::cerr << "Error: Number of threads must be positive" << std::endl;
        return 1;
    }

    std::cout << "Using " << num_threads << " threads" << std::endl;

    std::vector<int> a(VECTOR_SIZE);
    std::vector<int> b(VECTOR_SIZE);
    std::vector<int> c(VECTOR_SIZE);

    for (int i = 0; i < VECTOR_SIZE; i++) {
        a[i] = i + 1;
        b[i] = (i + 1) * 2;
    }

    int elements_per_thread = VECTOR_SIZE / num_threads;
    std::vector<std::thread> threads;

    for (int t = 0; t < num_threads; t++) {
        int start_idx = t * elements_per_thread;
        // The last thread also picks up the remainder of the vector
        int end_idx = (t == num_threads - 1) ? VECTOR_SIZE : (t + 1) * elements_per_thread;
        threads.emplace_back(vector_add_worker, t, start_idx, end_idx,
                             a.data(), b.data(), c.data());
    }

    for (auto& thread : threads) {
        thread.join();
    }

    std::cout << "\nFirst 5 elements of (A + B): ";
    for (int i = 0; i < 5; i++) {
        std::cout << c[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}
</code>

The following is the respective batch script for compiling and running the program. You can see the output of the program in the generated //vector_sum_threads_%j.out// file.

The //cpus-per-task// parameter of the batch script specifies the number of cores to be allocated for each task (**MPI** process). You can combine the use of **MPI** tasks and **threads** in order to start one process per node. Each node can then use multiple **threads** locally to do work in parallel, while the **threads** share the context of the process. A hedged **Python** sketch of this hybrid pattern follows the batch script below.

<code bash>
#!/bin/bash
#SBATCH --job-name=vector_sum_threads
#SBATCH --output=vector_sum_threads_%j.out
#SBATCH --error=vector_sum_threads_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --time=00:05:00
#SBATCH --partition=unite

echo "Job started at: $(date)"
echo "Running on node: $(hostname)"
echo "Number of CPUs allocated: $SLURM_CPUS_PER_TASK"
echo "----------------------------------------"

module load gcc

echo "Compiling vector_sum_threads.cpp..."
g++ -std=c++11 -pthread -O3 vector_sum_threads.cpp -o vector_sum_threads

if [ $? -eq 0 ]; then
    echo "Compilation successful!"
    echo "----------------------------------------"
    echo "Running vector_sum_threads with $SLURM_CPUS_PER_TASK threads..."
    ./vector_sum_threads $SLURM_CPUS_PER_TASK
    echo "----------------------------------------"
    echo "Job finished at: $(date)"
else
    echo "Compilation failed!"
    exit 1
fi
</code>
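The following is a hedged **Python** sketch of the hybrid **MPI** + **threads** pattern described above, an illustration only: it assumes the //mpi4py// and //numpy// packages are installed in your environment and that the job requests, for example, ''--ntasks=2 --cpus-per-task=4''. Because **NumPy** releases the GIL inside its array operations, the thread pool in each task can do real parallel work.

<code python>
#!/usr/bin/env python3
# Hedged sketch: MPI tasks for distribution, a thread pool inside each task.
# Assumes mpi4py and numpy are installed; the vector size is assumed to be
# divisible by the number of tasks, as in the C examples above.
import os
from concurrent.futures import ThreadPoolExecutor

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

n = 10_000_000
local_n = n // size
num_threads = int(os.environ.get("SLURM_CPUS_PER_TASK", "4"))

# The leader initializes the full vectors; the other ranks pass None.
a = np.random.rand(n) if rank == 0 else None
b = np.random.rand(n) if rank == 0 else None

local_a = np.empty(local_n)
local_b = np.empty(local_n)
comm.Scatter(a, local_a, root=0)
comm.Scatter(b, local_b, root=0)

local_c = np.empty(local_n)

def add_slice(start, end):
    # NumPy releases the GIL here, so the threads run in parallel.
    np.add(local_a[start:end], local_b[start:end], out=local_c[start:end])

chunk = (local_n + num_threads - 1) // num_threads
with ThreadPoolExecutor(max_workers=num_threads) as pool:
    for t in range(num_threads):
        pool.submit(add_slice, t * chunk, min((t + 1) * chunk, local_n))

c = np.empty(n) if rank == 0 else None
comm.Gather(local_c, c, root=0)
if rank == 0:
    print("First 5 elements of (A + B):", c[:5])
</code>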
----

====C++ program which uses GPU====

The following is an example **CUDA** application which uses an **Nvidia GPU** to perform element-wise addition of two vectors. Using **CUDA** with **Python** is similar, assuming that you know how to manage **Python** dependencies on the cluster, which is described in a previous section; a short sketch of that route follows the **C** example below. What is important here is to understand how to manage the resources of the system.

<code cpp>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t error = call;                                          \
        if (error != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA Error: %s:%d, %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(error));                            \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while(0)

/*
 * CUDA kernel for vector addition
 * Each thread computes one element of the result vector
 *
 * Parameters:
 *   a: First input vector
 *   b: Second input vector
 *   c: Output vector (result)
 *   n: Number of elements
 */
__global__ void vectorAddKernel(const float *a, const float *b, float *c, int n)
{
    // Calculate global thread ID
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Check if thread is within bounds
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main()
{
    const int N = 50'000'000;
    const size_t bytes = N * sizeof(float);

    printf("========================================\n");
    printf("CUDA Vector Addition Example\n");
    printf("========================================\n");
    printf("Vector size: %d elements\n", N);

    int deviceId;
    cudaDeviceProp props;
    CUDA_CHECK(cudaGetDevice(&deviceId));
    CUDA_CHECK(cudaGetDeviceProperties(&props, deviceId));

    printf("\nGPU Information:\n");
    printf("  Device: %s\n", props.name);
    printf("  Compute Capability: %d.%d\n", props.major, props.minor);
    printf("  Total Global Memory: %.2f GB\n",
           props.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf("  Multiprocessors: %d\n", props.multiProcessorCount);
    printf("  Max Threads per Block: %d\n", props.maxThreadsPerBlock);
    printf("  Warp Size: %d\n", props.warpSize);

    printf("\nAllocating host memory...\n");
    float *h_a = (float *)malloc(bytes);
    float *h_b = (float *)malloc(bytes);
    float *h_c_gpu = (float *)malloc(bytes);
    float *h_c_cpu = (float *)malloc(bytes);

    if (!h_a || !h_b || !h_c_gpu || !h_c_cpu) {
        fprintf(stderr, "Error: Host memory allocation failed!\n");
        return 1;
    }

    for (int i = 0; i < N; i++) {
        h_a[i] = (float)rand() / RAND_MAX;
        h_b[i] = (float)rand() / RAND_MAX;
    }

    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("\nKernel Configuration:\n");
    printf("  Threads per block: %d\n", threadsPerBlock);
    printf("  Blocks per grid: %d\n", blocksPerGrid);
    printf("  Total threads: %d\n", blocksPerGrid * threadsPerBlock);

    vectorAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);

    CUDA_CHECK(cudaMemcpy(h_c_gpu, d_c, bytes, cudaMemcpyDeviceToHost));

    printf("\nFirst 5 elements of result:\n");
    for (int i = 0; i < 5; i++) {
        printf("  c[%d] = %.6f\n", i, h_c_gpu[i]);
    }

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c_gpu);
    free(h_c_cpu);

    return 0;
}
</code>

The //gres// parameter of the batch script below specifies the total number of **GPUs** to be allocated. You can combine the use of **MPI** tasks and **GPUs** in order to start one process per node; each node can then use multiple **GPUs** locally to do work in parallel, as sketched in the second example below.
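For the **Python** route mentioned in the introduction of this section, the following is a hedged **CuPy** sketch, an illustration only: it assumes a //cupy// package matching the cluster's **CUDA** version is installed in your **Python** environment.

<code python>
#!/usr/bin/env python3
# Hedged CuPy sketch: the same element-wise addition on the GPU.
# Assumes the cupy package (matching the cluster's CUDA version) is
# installed in your Python environment.
import cupy as cp

n = 50_000_000
a = cp.random.rand(n, dtype=cp.float32)  # allocated directly on the GPU
b = cp.random.rand(n, dtype=cp.float32)

c = a + b  # executes as a CUDA kernel on the device
cp.cuda.Stream.null.synchronize()  # wait for the GPU to finish

print("First 5 elements of result:", cp.asnumpy(c[:5]))
</code>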
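To combine **MPI** tasks with **GPUs** as described above, each task can pin itself to one of the **GPUs** visible on its node. The following is a hypothetical sketch, again assuming //mpi4py// and //cupy//; note that which **GPUs** are visible to each task depends on your //gres// settings.

<code python>
#!/usr/bin/env python3
# Hypothetical sketch: map each MPI task to one of the GPUs on its node.
# Assumes mpi4py and cupy are installed; GPU visibility per task depends
# on the job's gres settings.
from mpi4py import MPI
import cupy as cp

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Pin this task to one of the visible GPUs.
num_gpus = cp.cuda.runtime.getDeviceCount()
gpu_id = rank % num_gpus
cp.cuda.Device(gpu_id).use()

n = 10_000_000
a = cp.random.rand(n, dtype=cp.float32)
b = cp.random.rand(n, dtype=cp.float32)
c = a + b
cp.cuda.Stream.null.synchronize()

print(f"Task {rank} on GPU {gpu_id}: c[0] = {float(c[0]):.6f}")
</code>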
The following is the respective batch script for compiling and running the **CUDA** program. You can see the output of the program in the generated //vector_sum_cuda_%j.out// file.

<code bash>
#!/bin/bash
#SBATCH --job-name=vector_sum_cuda
#SBATCH --output=vector_sum_cuda_%j.out
#SBATCH --error=vector_sum_cuda_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:1
#SBATCH --time=00:10:00
#SBATCH --partition=unite

echo "========================================="
echo "SLURM Job Information"
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "GPU(s): $SLURM_GPUS_ON_NODE"
echo "Starting at: $(date)"
echo ""

module load nvidia/cuda/12-latest

echo "Compiling vector_sum_cuda.cu..."
nvcc -O3 -o vector_sum_cuda vector_sum_cuda.cu

if [ $? -ne 0 ]; then
    echo "Error: Compilation failed!"
    exit 1
fi

echo "Compilation successful!"
echo ""
echo "Running vector_sum_cuda..."
./vector_sum_cuda

echo ""
echo "Job finished at: $(date)"
</code>

----