Skip to content

Commit c463ecc

Browse files
allenwang28 and facebook-github-bot
authored and committed
(5/N) Extend monarch_extension with RDMA (#545)
Summary: Pull Request resolved: #545.
This diff:
1. Adds `monarch_rdma` to `monarch_extension`.
2. Changes several tests to use the `*_no_torch` build. Tests / targets that don't use the `_no_torch` build should be run with GPUs from here on.
Reviewed By: amirafzali
Differential Revision: D78290956
fbshipit-source-id: c9425f723d88c1d8cecb3ef89acad3167064f296
1 parent cef93c7 commit c463ecc

File tree

8 files changed

+82
-26
lines changed

8 files changed

+82
-26
lines changed

.github/workflows/build-cuda.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ jobs:
3434
# Setup build environment (conda + system deps + rust + build deps)
3535
setup_build_environment
3636
37+
# Setup Tensor Engine
38+
setup_tensor_engine
39+
3740
# Build the process allocator binary
3841
build_process_allocator
3942

.github/workflows/test-cuda.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ jobs:
4444
chmod +x cargo_bin/process_allocator
4545
export PATH=$(pwd)/cargo_bin:$PATH
4646
47+
# Setup Tensor Engine dependencies
48+
setup_tensor_engine
49+
4750
# Install the built wheel from artifact
4851
install_wheel_from_artifact
4952

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@ members = [
1212
"ndslice",
1313
"monarch_extension",
1414
"monarch_tensor_worker",
15+
"monarch_rdma",
1516
"nccl-sys",
1617
"rdmacore-sys",
1718
"torch-sys",
19+
"rdmacore-sys",
20+
"cuda-sys",
1821
]

cuda-sys/build.rs

Lines changed: 63 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ use std::path::PathBuf;
1313
use glob::glob;
1414
use which::which;
1515

16+
const PYTHON_PRINT_DIRS: &str = r"
17+
import sysconfig
18+
print('PYTHON_INCLUDE_DIR:', sysconfig.get_config_var('INCLUDEDIR'))
19+
print('PYTHON_LIB_DIR:', sysconfig.get_config_var('LIBDIR'))
20+
";
21+
1622
// Translated from torch/utils/cpp_extension.py
1723
fn find_cuda_home() -> Option<String> {
1824
// Guess #1
@@ -52,34 +58,50 @@ fn find_cuda_home() -> Option<String> {
5258
cuda_home
5359
}
5460

55-
fn main() {
56-
let cuda_home = find_cuda_home().expect("Could not find CUDA installation");
57-
58-
// Tell cargo to look for shared libraries in the CUDA directory
59-
println!("cargo:rustc-link-search={}/lib64", cuda_home);
60-
println!("cargo:rustc-link-search={}/lib", cuda_home);
61+
fn emit_cuda_link_directives(cuda_home: &str) {
62+
let stubs_path = format!("{}/lib64/stubs", cuda_home);
63+
if Path::new(&stubs_path).exists() {
64+
println!("cargo:rustc-link-search=native={}", stubs_path);
65+
} else {
66+
let lib64_path = format!("{}/lib64", cuda_home);
67+
if Path::new(&lib64_path).exists() {
68+
println!("cargo:rustc-link-search=native={}", lib64_path);
69+
}
70+
}
6171

62-
// Link against the CUDA libraries
6372
println!("cargo:rustc-link-lib=cuda");
6473
println!("cargo:rustc-link-lib=cudart");
74+
}
6575

66-
// Tell cargo to invalidate the built crate whenever the wrapper changes
67-
println!("cargo:rerun-if-changed=src/wrapper.h");
76+
fn python_env_dirs() -> (Option<String>, Option<String>) {
77+
let output = std::process::Command::new(PathBuf::from("python"))
78+
.arg("-c")
79+
.arg(PYTHON_PRINT_DIRS)
80+
.output()
81+
.unwrap_or_else(|_| panic!("error running python"));
6882

69-
// Add cargo metadata
70-
println!("cargo:rustc-cfg=cargo");
71-
println!("cargo:rustc-check-cfg=cfg(cargo)");
83+
let mut include_dir = None;
84+
let mut lib_dir = None;
85+
for line in String::from_utf8_lossy(&output.stdout).lines() {
86+
if let Some(path) = line.strip_prefix("PYTHON_INCLUDE_DIR: ") {
87+
include_dir = Some(path.to_string());
88+
}
89+
if let Some(path) = line.strip_prefix("PYTHON_LIB_DIR: ") {
90+
lib_dir = Some(path.to_string());
91+
}
92+
}
93+
(include_dir, lib_dir)
94+
}
7295

73-
// The bindgen::Builder is the main entry point to bindgen
74-
let bindings = bindgen::Builder::default()
96+
fn main() {
97+
let mut builder = bindgen::Builder::default()
7598
// The input header we would like to generate bindings for
7699
.header("src/wrapper.h")
77-
// Add the CUDA include directory
78-
.clang_arg(format!("-I{}/include", cuda_home))
79-
// Parse as C++
80100
.clang_arg("-x")
81101
.clang_arg("c++")
82102
.clang_arg("-std=gnu++20")
103+
.clang_arg(format!("-I{}/include", find_cuda_home().unwrap()))
104+
.parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
83105
// Allow the specified functions and types
84106
.allowlist_function("cu.*")
85107
.allowlist_function("CU.*")
@@ -89,16 +111,33 @@ fn main() {
89111
.default_enum_style(bindgen::EnumVariation::NewType {
90112
is_bitfield: false,
91113
is_global: false,
92-
})
93-
// Finish the builder and generate the bindings
94-
.parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
95-
.generate()
96-
// Unwrap the Result and panic on failure
97-
.expect("Unable to generate bindings");
114+
});
115+
116+
// Include headers and libs from the active environment.
117+
let (include_dir, lib_dir) = python_env_dirs();
118+
if let Some(include_dir) = include_dir {
119+
builder = builder.clang_arg(format!("-I{}", include_dir));
120+
}
121+
if let Some(lib_dir) = lib_dir {
122+
println!("cargo::rustc-link-search=native={}", lib_dir);
123+
// Set cargo metadata to inform dependent binaries about how to set their
124+
// RPATH (see controller/build.rs for an example).
125+
println!("cargo::metadata=LIB_PATH={}", lib_dir);
126+
}
127+
if let Some(cuda_home) = find_cuda_home() {
128+
emit_cuda_link_directives(&cuda_home);
129+
}
98130

99131
// Write the bindings to the $OUT_DIR/bindings.rs file
100132
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
101-
bindings
133+
builder
134+
.generate()
135+
.expect("Unable to generate bindings")
102136
.write_to_file(out_path.join("bindings.rs"))
103137
.expect("Couldn't write bindings!");
138+
139+
println!("cargo:rustc-link-lib=cuda");
140+
println!("cargo:rustc-link-lib=cudart");
141+
println!("cargo::rustc-cfg=cargo");
142+
println!("cargo::rustc-check-cfg=cfg(cargo)");
104143
}

monarch_extension/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiproces
2626
libc = "0.2.139"
2727
monarch_hyperactor = { version = "0.0.0", path = "../monarch_hyperactor" }
2828
monarch_messages = { version = "0.0.0", path = "../monarch_messages", optional = true }
29+
monarch_rdma_extension = { version = "0.0.0", path = "../monarch_rdma/extension", optional = true }
2930
monarch_simulator_lib = { version = "0.0.0", path = "../monarch_simulator", optional = true }
3031
monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker", optional = true }
3132
monarch_types = { version = "0.0.0", path = "../monarch_types" }
@@ -41,4 +42,4 @@ tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
4142

4243
[features]
4344
default = ["tensor_engine"]
44-
tensor_engine = ["dep:controller", "dep:monarch_messages", "dep:monarch_simulator_lib", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:torch-sys", "dep:torch-sys-cuda"]
45+
tensor_engine = ["dep:controller", "dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_simulator_lib", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:torch-sys", "dep:torch-sys-cuda"]

monarch_extension/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ pub fn mod_init(module: &Bound<'_, PyModule>) -> PyResult<()> {
129129
module,
130130
"monarch_extension.mesh_controller",
131131
)?)?;
132+
monarch_rdma_extension::register_python_bindings(&get_or_add_new_module(module, "rdma")?)?;
132133
}
133134
simulation_tools::register_python_bindings(&get_or_add_new_module(
134135
module,

python/tests/_monarch/test_hyperactor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
from monarch._rust_bindings.monarch_hyperactor.actor import PanicFlag, PythonMessage
1818

19-
from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
19+
from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension_no_torch
2020
AllocConstraints,
2121
AllocSpec,
2222
)

scripts/common-setup.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ install_wheel_from_artifact() {
5252
pip install "${RUNNER_ARTIFACT_DIR}"/*.whl
5353
}
5454

55+
# Setup and install dependencies for Tensor Engine
56+
setup_tensor_engine() {
57+
echo "Installing Tensor Engine dependencies..."
58+
dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
59+
}
60+
5561
# Build process allocator binary
5662
build_process_allocator() {
5763
echo "Building process allocator binary..."

0 commit comments

Comments
 (0)