From 3b5d2bfe5c717d13509a5fd4b1c3807a4738a5fb Mon Sep 17 00:00:00 2001
From: tmadhyastha
Date: Mon, 14 Jul 2025 15:37:35 +0000
Subject: [PATCH] Add Ray on Slurm examples that correspond to the Google doc

---
 ray-slurm/README.md             |  7 +++
 ray-slurm/ray.container.sbatch  | 98 ++++++++++++++++++++++++++++++++
 ray-slurm/ray.devcluster.sbatch | 85 ++++++++++++++++++++++++++
 ray-slurm/ray.samplejob.sbatch  | 66 +++++++++++++++++++
 ray-slurm/tune_basic_example.py | 58 ++++++++++++++++
 5 files changed, 314 insertions(+)
 create mode 100644 ray-slurm/README.md
 create mode 100644 ray-slurm/ray.container.sbatch
 create mode 100644 ray-slurm/ray.devcluster.sbatch
 create mode 100644 ray-slurm/ray.samplejob.sbatch
 create mode 100644 ray-slurm/tune_basic_example.py

diff --git a/ray-slurm/README.md b/ray-slurm/README.md
new file mode 100644
index 0000000..d96d813
--- /dev/null
+++ b/ray-slurm/README.md
@@ -0,0 +1,7 @@
### Ray on Slurm

There are three example sbatch scripts in this directory that illustrate how to run Ray on Slurm (submit each with `sbatch <script>`):

- ray.devcluster.sbatch: creates a Ray cluster that you can use interactively, using a Conda environment.
- ray.container.sbatch: creates a Ray cluster that runs inside a container imported with enroot.
- ray.samplejob.sbatch: creates a Ray cluster and runs a sample job, using a Conda environment.
\ No newline at end of file

diff --git a/ray-slurm/ray.container.sbatch b/ray-slurm/ray.container.sbatch
new file mode 100644
index 0000000..4c1536c
--- /dev/null
+++ b/ray-slurm/ray.container.sbatch
@@ -0,0 +1,98 @@
#!/bin/bash
#SBATCH --job-name=raytest
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1   # we launch one Ray process (head or worker) per node
#SBATCH --cpus-per-task=8     # each task gets 8 CPUs. Adjust as needed.
#SBATCH --mem-per-cpu=1GB     # each CPU gets 1 GB of memory. Adjust as needed.
#SBATCH --gpus-per-task=8     # each task gets 8 GPUs. Adjust as needed.
#SBATCH --time=1-00:00:00     # time limit of one day


# Get the node names in this allocation; the first node will run the Ray head.
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}

# Obtain the head node's IP address so that the worker nodes can connect to the head node.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# If we detect a space character in the head node IP (several addresses were
# returned), keep only the IPv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<<"$head_node_ip"
    if [[ ${#ADDR[0]} -gt 16 ]]; then
        head_node_ip=${ADDR[1]}
    else
        head_node_ip=${ADDR[0]}
    fi
    echo "Multiple addresses detected; using IPv4 address $head_node_ip"
fi

# If multiple users will be starting Ray clusters on the same nodes, the clusters
# must not conflict, so each user should choose a unique set of ports following
# the pattern below. Please see
# https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html#slurm-networking-caveats
# for more information.
port=6379
node_manager_port=6700
object_manager_port=6701
ray_client_server_port=10001
redis_shard_ports=6702
min_worker_port=10002
max_worker_port=19999

ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

# Pull the Ray image from Docker Hub once and cache it for all nodes.
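# Note: this example assumes that enroot is available to import the image, that
# srun's --container-* options are provided by the pyxis plugin, and that $HOME
# is on a filesystem shared by all nodes so the cached squashfs image is visible
# everywhere. Adjust for your site's container runtime if it differs.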
if [ -f "${HOME}/ray-nightly-py39-gpu.sqsh" ]; then
    echo "Ray image already exists, skipping pull."
else
    echo "Pulling Ray image from Docker Hub..."
    enroot import -o ${HOME}/ray-nightly-py39-gpu.sqsh docker://rayproject/ray:nightly-py39-gpu
fi

# Start the head node using one node and one task, explicitly setting the number
# of CPUs and GPUs on that node. Launch it in the background so that we can
# continue and start the worker nodes.
echo "Starting HEAD at $head_node"
srun --container-image=${HOME}/ray-nightly-py39-gpu.sqsh --container-remap-root --container-mounts=/mnt/home:/mnt/home \
    --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" \
    --port=$port \
    --node-manager-port=$node_manager_port \
    --object-manager-port=$object_manager_port \
    --ray-client-server-port=$ray_client_server_port \
    --redis-shard-ports=$redis_shard_ports \
    --min-worker-port=$min_worker_port \
    --max-worker-port=$max_worker_port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 8 --block &

# Give the head node time to start. Optional, though it may be useful with some
# older versions of Ray.
sleep 10

# Number of nodes other than the head node.
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --container-image=${HOME}/ray-nightly-py39-gpu.sqsh --container-remap-root --container-mounts=/mnt/home:/mnt/home \
        --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$ip_head" \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 8 --block &
    sleep 5
done

# Sleep to keep the job running until the time limit specified in the #SBATCH
# directives is reached. The job does not exit immediately after starting the
# Ray cluster, so you can use the cluster interactively.
sleep inf

# If instead you want to run a Ray job, do so using the same container. Use
# --overlap so that the job step can share resources with the head and worker
# steps that are already running on this allocation.

#srun -J "cont" --overlap --container-image=${HOME}/ray-nightly-py39-gpu.sqsh --container-remap-root --container-mounts=/mnt/home:/mnt/home python3 -u /mnt/home/tmadhyastha/ray/tune_basic_example.py "$SLURM_CPUS_PER_TASK"
\ No newline at end of file

diff --git a/ray-slurm/ray.devcluster.sbatch b/ray-slurm/ray.devcluster.sbatch
new file mode 100644
index 0000000..f90800a
--- /dev/null
+++ b/ray-slurm/ray.devcluster.sbatch
@@ -0,0 +1,85 @@
#!/bin/bash
#SBATCH --job-name=raytest
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1   # we launch one Ray process (head or worker) per node
#SBATCH --cpus-per-task=8     # each task gets 8 CPUs. Adjust as needed.
#SBATCH --mem-per-cpu=1GB     # each CPU gets 1 GB of memory. Adjust as needed.
#SBATCH --gpus-per-task=8     # each task gets 8 GPUs. Adjust as needed.
#SBATCH --time=1-00:00:00     # time limit of one day


# Activate a Conda environment named "rayenv" to load Ray and its dependencies.
# This assumes that you have already created the environment with Ray installed.
eval "$(conda shell.bash hook)"
conda activate rayenv

# Get the node names in this allocation; the first node will run the Ray head.
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}

# Obtain the head node's IP address so that the worker nodes can connect to the head node.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# If we detect a space character in the head node IP (several addresses were
# returned), keep only the IPv4 address. This step is optional.
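# Background: `hostname --ip-address` may print more than one address (for
# example an IPv6 and an IPv4 address) separated by spaces. Ray needs a single
# address, so if the first field looks like IPv6 (longer than 16 characters) we
# use the second field; otherwise we use the first.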
if [[ "$head_node_ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<<"$head_node_ip"
    if [[ ${#ADDR[0]} -gt 16 ]]; then
        head_node_ip=${ADDR[1]}
    else
        head_node_ip=${ADDR[0]}
    fi
    echo "Multiple addresses detected; using IPv4 address $head_node_ip"
fi

# If multiple users will be starting Ray clusters on the same nodes, the clusters
# must not conflict, so each user should choose a unique set of ports following
# the pattern below. Please see
# https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html#slurm-networking-caveats
# for more information.
port=6379
node_manager_port=6700
object_manager_port=6701
ray_client_server_port=10001
redis_shard_ports=6702
min_worker_port=10002
max_worker_port=19999

ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

# Start the head node using one node and one task, explicitly setting the number
# of CPUs and GPUs on that node. Launch it in the background so that we can
# continue and start the worker nodes.
echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" \
    --port=$port \
    --node-manager-port=$node_manager_port \
    --object-manager-port=$object_manager_port \
    --ray-client-server-port=$ray_client_server_port \
    --redis-shard-ports=$redis_shard_ports \
    --min-worker-port=$min_worker_port \
    --max-worker-port=$max_worker_port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 8 --block &

# Give the head node time to start. Optional, though it may be useful with some
# older versions of Ray.
sleep 10

# Number of nodes other than the head node.
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$ip_head" \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 8 --block &
    sleep 5
done

# Sleep to keep the job running until the time limit specified in the #SBATCH
# directives is reached. The job does not exit immediately after starting the
# Ray cluster, so you can use the cluster interactively.
sleep inf

diff --git a/ray-slurm/ray.samplejob.sbatch b/ray-slurm/ray.samplejob.sbatch
new file mode 100644
index 0000000..e7bd243
--- /dev/null
+++ b/ray-slurm/ray.samplejob.sbatch
@@ -0,0 +1,66 @@
#!/bin/bash
#SBATCH --job-name=raytest
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1   # we launch one Ray process (head or worker) per node
#SBATCH --cpus-per-task=8     # each task gets 8 CPUs. Adjust as needed.
#SBATCH --mem-per-cpu=1GB     # each CPU gets 1 GB of memory. Adjust as needed.
#SBATCH --gpus-per-task=8     # each task gets 8 GPUs. Adjust as needed.
#SBATCH --time=1-00:00:00     # time limit of one day


# Activate a Conda environment named "rayenv" to load Ray and its dependencies.
# This assumes that you have already created the environment with Ray installed.
eval "$(conda shell.bash hook)"
conda activate rayenv

# Get the node names in this allocation; the first node will run the Ray head.
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}

# Obtain the head node's IP address so that the worker nodes can connect to the head node.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# If we detect a space character in the head node IP (several addresses were
# returned), keep only the IPv4 address. This step is optional.
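# (Same address-selection logic as in the other scripts: if hostname returns
# both an IPv6 and an IPv4 address, keep the IPv4 one.)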
if [[ "$head_node_ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<<"$head_node_ip"
    if [[ ${#ADDR[0]} -gt 16 ]]; then
        head_node_ip=${ADDR[1]}
    else
        head_node_ip=${ADDR[0]}
    fi
    echo "Multiple addresses detected; using IPv4 address $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
# Echo the head-node command to the job log, then launch it in the background.
echo srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" --port=$port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 1 --block

srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" --port=$port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 1 --block &

# Give the head node time to start. Optional, though it may be useful with some
# older versions of Ray.
sleep 10

# Number of nodes other than the head node.
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$ip_head" \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 1 --block &
    sleep 5
done

# Run the sample Ray Tune job (tune_basic_example.py, adapted from the Ray
# documentation) against the cluster we just started.
python -u tune_basic_example.py "$SLURM_CPUS_PER_TASK"

diff --git a/ray-slurm/tune_basic_example.py b/ray-slurm/tune_basic_example.py
new file mode 100644
index 0000000..c9b025d
--- /dev/null
+++ b/ray-slurm/tune_basic_example.py
@@ -0,0 +1,58 @@
"""This example demonstrates basic Ray Tune random search and grid search."""
import os
import time

import ray
from ray.air import session
from ray import tune


def evaluation_fn(step, width, height):
    # Simulate an expensive evaluation and return a score that depends on the
    # hyperparameters and the training step.
    time.sleep(0.1)
    return (0.1 + width * step / 100) ** (-1) + height * 0.1


def easy_objective(config):
    # Hyperparameters
    width, height = config["width"], config["height"]

    for step in range(config["steps"]):
        # Iterative training function - can be any arbitrary training procedure.
        intermediate_score = evaluation_fn(step, width, height)
        # Feed the score back to Tune.
        session.report({"iterations": step, "mean_loss": intermediate_score})


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    # Extra command-line arguments (e.g. the CPU count passed by the sbatch
    # script) are accepted but not used.
    args, _ = parser.parse_known_args()

    # Set the Ray temp directory to a shared location.
    os.environ["RAY_TMPDIR"] = "/mnt/home/tmadhyastha/ray_temp"

    ray.init(configure_logging=False)

    # This will do a grid search over the `activation` parameter, so each of its
    # two values (`relu` and `tanh`) is evaluated once per sample
    # (`num_samples`); we end up with 2 * 50 = 100 trials. (The objective
    # ignores `activation`; it is only here to demonstrate grid search.)
    # The `width` and `height` parameters are sampled randomly, and `steps`
    # is a constant parameter.
    tuner = tune.Tuner(
        easy_objective,
        tune_config=tune.TuneConfig(
            metric="mean_loss",
            mode="min",
            num_samples=50,
        ),
        param_space={
            "steps": 10000,
            "width": tune.uniform(0, 20),
            "height": tune.uniform(-100, 100),
            "activation": tune.grid_search(["relu", "tanh"]),
        },
    )
    results = tuner.fit()

    print("Best hyperparameters found were: ", results.get_best_result().config)