7 changes: 7 additions & 0 deletions ray-slurm/README.md
@@ -0,0 +1,7 @@
### Ray on Slurm

This directory contains three example sbatch scripts that illustrate how to run Ray on Slurm. A minimal submission sketch follows the list.

- `ray.devcluster.sbatch`: creates a Ray cluster that you can use interactively, using a Conda environment.
- `ray.container.sbatch`: runs Ray in a container.
- `ray.samplejob.sbatch`: creates a Ray cluster and runs a sample job, using a Conda environment.
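
For example, to bring up the interactive cluster and then attach to it, the following is a minimal sketch (the job name `raytest` and the Conda environment `rayenv` come from the scripts; `<JOBID>` is a placeholder, and your site may require extra options such as a partition or account):

```bash
# Submit the interactive cluster script.
sbatch ray.devcluster.sbatch

# Find the job ID and the allocated nodes; the first node in the list is the Ray head.
squeue -u "$USER" -n raytest

# Open a shell inside the running allocation and inspect the cluster.
srun --jobid=<JOBID> --overlap --pty bash
conda activate rayenv
ray status   # should report the head node plus one worker
```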
98 changes: 98 additions & 0 deletions ray-slurm/ray.container.sbatch
@@ -0,0 +1,98 @@
#!/bin/bash
#SBATCH --job-name=raytest
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --tasks-per-node=1 # we will launch one worker task per node
#SBATCH --cpus-per-task=8 # each worker task gets 8 CPUs. Adjust as needed.
#SBATCH --mem-per-cpu=1GB # each cpu gets 1GB of memory. Adjust as needed.
#SBATCH --gpus-per-task=8 # each worker task will use 8 GPUs. Adjust as needed.
#SBATCH --time=1-00:00:00 # specify a time limit of one day



nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)


# Obtain the head node's IP address so that the worker nodes can connect to it.
# If the reported address contains a space, the node has both an IPv6 and an
# IPv4 address; keep the IPv4 one. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
  IFS=' ' read -ra ADDR <<<"$head_node_ip"
  if [[ ${#ADDR[0]} -gt 16 ]]; then
    head_node_ip=${ADDR[1]}
  else
    head_node_ip=${ADDR[0]}
  fi
  echo "IPv6 address detected. Using IPv4 address: $head_node_ip"
fi
# If multiple users (or multiple jobs) may start Ray clusters on the same nodes,
# make sure their clusters do not conflict: each one needs a unique set of ports.
# See https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html#slurm-networking-caveats
# for details. A commented sketch of one way to derive unique ports follows the
# assignments below.
# Start the head node using one node and one task, explicitly setting the number
# of CPUs and GPUs on that node. Launch it in the background so that we can go
# on to start the worker nodes.
port=6379
node_manager_port=6700
object_manager_port=6701
ray_client_server_port=10001
redis_shard_ports=6702
min_worker_port=10002
max_worker_port=19999
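
# If several jobs may run Ray on the same nodes, one simple sketch (an
# illustration, not prescribed by the Ray docs) is to offset every port above
# by a per-job value derived from the job ID, for example:
#   offset=$(( SLURM_JOB_ID % 100 ))
#   port=$(( port + offset ))
#   node_manager_port=$(( node_manager_port + offset ))
#   # ...and apply the same offset to the remaining ports.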

ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

# Import the Ray image from Docker Hub once; all nodes then use the same
# squashfs file. Skip the import if a cached copy already exists.
if [ -f "${HOME}/ray-nightly-py39-gpu.sqsh" ]; then
  echo "Ray image already exists, skipping pull."
else
  echo "Pulling Ray image from Docker Hub..."
  enroot import -o "${HOME}/ray-nightly-py39-gpu.sqsh" docker://rayproject/ray:nightly-py39-gpu
fi


echo "Starting HEAD at $head_node"
srun --container-image="${HOME}/ray-nightly-py39-gpu.sqsh" --container-remap-root --container-mounts=/mnt/home:/mnt/home \
    --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" \
    --port=$port \
    --node-manager-port=$node_manager_port \
    --object-manager-port=$object_manager_port \
    --ray-client-server-port=$ray_client_server_port \
    --redis-shard-ports=$redis_shard_ports \
    --min-worker-port=$min_worker_port \
    --max-worker-port=$max_worker_port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 8 --block &


# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
  node_i=${nodes_array[$i]}
  echo "Starting WORKER $i at $node_i"
  srun --container-image="${HOME}/ray-nightly-py39-gpu.sqsh" --container-remap-root --container-mounts=/mnt/home:/mnt/home \
      --nodes=1 --ntasks=1 -w "$node_i" \
      ray start --address "$ip_head" \
      --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 8 --block &
  sleep 5
done

# sleep to keep the job running until any time limit specified using #SBATCH directives is reached.
# This just means that the job will not exit immediately after starting the Ray cluster, so you
# can use it interactively.
sleep inf

# If instead you want to run a Ray job, launch it with the same container image.
# Use --overlap so that the job step can share resources with the head and worker
# steps, which already occupy all of the allocated nodes.

#srun -J "cont" --overlap --container-image=${HOME}/ray-nightly-py39-gpu.sqsh --container-remap-root --container-mounts=/mnt/home:/mnt/home python3 -u /mnt/home/tmadhyastha/ray/tune_basic_example.py "$SLURM_CPUS_PER_TASK"
85 changes: 85 additions & 0 deletions ray-slurm/ray.devcluster.sbatch
@@ -0,0 +1,85 @@
#!/bin/bash
#SBATCH --job-name=raytest
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --tasks-per-node=1 # we will launch one worker task per node
#SBATCH --cpus-per-task=8 # each worker task gets 8 CPUs. Adjust as needed.
#SBATCH --mem-per-cpu=1GB # each cpu gets 1GB of memory. Adjust as needed.
#SBATCH --gpus-per-task=8 # each worker task will use 8 GPUs. Adjust as needed.
#SBATCH --time=1-00:00:00 # specify a time limit of one day


# Here we activate a conda environment named "rayenv" to load Ray and its dependencies.

eval "$(conda shell.bash hook)"
conda activate rayenv
# Getting the node names
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)


# Obtain the head node's IP address so that the worker nodes can connect to it.
# If the reported address contains a space, the node has both an IPv6 and an
# IPv4 address; keep the IPv4 one. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
  IFS=' ' read -ra ADDR <<<"$head_node_ip"
  if [[ ${#ADDR[0]} -gt 16 ]]; then
    head_node_ip=${ADDR[1]}
  else
    head_node_ip=${ADDR[0]}
  fi
  echo "IPv6 address detected. Using IPv4 address: $head_node_ip"
fi
# If multiple users (or multiple jobs) may start Ray clusters on the same nodes,
# make sure their clusters do not conflict: each one needs a unique set of ports.
# See https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html#slurm-networking-caveats
# for details (ray.container.sbatch includes a commented sketch of one way to
# derive unique ports).
# Start the head node using one node and one task, explicitly setting the number
# of CPUs and GPUs on that node. Launch it in the background so that we can go
# on to start the worker nodes.
port=6379
node_manager_port=6700
object_manager_port=6701
ray_client_server_port=10001
redis_shard_ports=6702
min_worker_port=10002
max_worker_port=19999

ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" \
    --port=$port \
    --node-manager-port=$node_manager_port \
    --object-manager-port=$object_manager_port \
    --ray-client-server-port=$ray_client_server_port \
    --redis-shard-ports=$redis_shard_ports \
    --min-worker-port=$min_worker_port \
    --max-worker-port=$max_worker_port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 8 --block &


# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
  node_i=${nodes_array[$i]}
  echo "Starting WORKER $i at $node_i"
  srun --nodes=1 --ntasks=1 -w "$node_i" \
      ray start --address "$ip_head" \
      --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 8 --block &
  sleep 5
done

# sleep to keep the job running until any time limit specified using #SBATCH directives is reached.
# This just means that the job will not exit immediately after starting the Ray cluster, so you
# can use it interactively.
sleep inf
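
# To use the running cluster from another shell, a minimal sketch (assuming the
# "rayenv" environment is available there and that the machine you connect from
# can reach the compute nodes): point Ray at the head node, whose address is
# printed above as "IP Head: ...".
#   conda activate rayenv
#   export RAY_ADDRESS=<head_node_ip>:6379
#   python -c 'import ray; ray.init(); print(ray.cluster_resources())'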
66 changes: 66 additions & 0 deletions ray-slurm/ray.samplejob.sbatch
@@ -0,0 +1,66 @@
#!/bin/bash
#SBATCH --job-name=raytest
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --tasks-per-node=1 # we will launch one worker task per node
#SBATCH --cpus-per-task=8 # each worker task gets 8 CPUs. Adjust as needed.
#SBATCH --mem-per-cpu=1GB # each cpu gets 1GB of memory. Adjust as needed.
#SBATCH --gpus-per-task=8 # each worker task will use 8 GPUs. Adjust as needed.
#SBATCH --time=1-00:00:00 # specify a time limit of one day


# Here we activate a conda environment named "rayenv" to load Ray and its dependencies.
# This assumes that you have already created a conda environment named "rayenv" with Ray installed.
eval "$(conda shell.bash hook)"
conda activate rayenv
# Getting the node names
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# If the reported head node IP contains a space, the node has both an IPv6 and
# an IPv4 address; keep the IPv4 one. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
  IFS=' ' read -ra ADDR <<<"$head_node_ip"
  if [[ ${#ADDR[0]} -gt 16 ]]; then
    head_node_ip=${ADDR[1]}
  else
    head_node_ip=${ADDR[0]}
  fi
  echo "IPv6 address detected. Using IPv4 address: $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
echo srun --nodes=1 --ntasks=1 -w "$head_node" \
ray start --head --node-ip-address="$head_node_ip" --port=$port \
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 1 --block

srun --nodes=1 --ntasks=1 -w "$head_node" \
ray start --head --node-ip-address="$head_node_ip" --port=$port \
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 1 --block &

# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
  node_i=${nodes_array[$i]}
  echo "Starting WORKER $i at $node_i"
  srun --nodes=1 --ntasks=1 -w "$node_i" \
      ray start --address "$ip_head" \
      --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 1 --block &
  sleep 5
done

# Run the sample Tune job (tune_basic_example.py, included in this directory).
# For another example driver, see ray/doc/source/cluster/doc_code/simple-trainer.py.
python -u tune_basic_example.py "$SLURM_CPUS_PER_TASK"
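
# While the job runs, Tune's progress table and the final "Best hyperparameters"
# line go to the job's standard output (by default slurm-<JOBID>.out in the
# submission directory), so one way to follow along is:
#   tail -f slurm-<JOBID>.out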

58 changes: 58 additions & 0 deletions ray-slurm/tune_basic_example.py
@@ -0,0 +1,58 @@
"""This example demonstrates basic Ray Tune random search and grid search."""
import os
import time

import ray
from ray.air import session
from ray import tune

def evaluation_fn(step, width, height):
    time.sleep(0.1)
    return (0.1 + width * step / 100) ** (-1) + height * 0.1


def easy_objective(config):
    # Hyperparameters
    width, height = config["width"], config["height"]

    for step in range(config["steps"]):
        # Iterative training function - can be any arbitrary training procedure
        intermediate_score = evaluation_fn(step, width, height)
        # Feed the score back to Tune.
        session.report({"iterations": step, "mean_loss": intermediate_score})


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    # Parse (and ignore) any extra CLI arguments, such as the CPU count passed
    # in by the sbatch script.
    args, _ = parser.parse_known_args()

    # Set the Ray temp directory to a shared location.
    os.environ["RAY_TMPDIR"] = "/mnt/home/tmadhyastha/ray_temp"

    ray.init(configure_logging=False)

    # This will do a grid search over the `activation` parameter. This means
    # that each of the two values (`relu` and `tanh`) will be sampled once
    # for each sample (`num_samples`). We end up with 2 * 50 = 100 samples.
    # The `width` and `height` parameters are sampled randomly.
    # `steps` is a constant parameter.

    tuner = tune.Tuner(
        easy_objective,
        tune_config=tune.TuneConfig(
            metric="mean_loss",
            mode="min",
            num_samples=50,
        ),
        param_space={
            "steps": 10000,
            "width": tune.uniform(0, 20),
            "height": tune.uniform(-100, 100),
            "activation": tune.grid_search(["relu", "tanh"]),
        },
    )
    results = tuner.fit()

    print("Best hyperparameters found were: ", results.get_best_result().config)