Skip to content

Commit a499837

Browse files
dstaay-fbfacebook-github-bot
authored and committed
Util to validate Execution Env (#541)
Summary: This adds a test util (run on test setups) to ensure we have the correct driver access and that the flags are populated accordingly. LSS: we need nv_peermem AND PeerMappingOverride=1 set. For tests to pass, needed to set to local_enabled. Reviewed By: allenwang28 Differential Revision: D78362879
1 parent 438a1e6 commit a499837

File tree

2 files changed

+34
-4
lines changed

2 files changed

+34
-4
lines changed

monarch_rdma/Cargo.toml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,3 @@ hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" }
2020
ndslice = { version = "0.0.0", path = "../ndslice" }
2121
timed_test = { version = "0.0.0", path = "../timed_test" }
2222
tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] }
23-
24-
[features]
25-
cuda = []
26-
default = ["cuda"]

monarch_rdma/src/test_utils.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,40 @@ pub mod test_utils {
3131
use crate::rdma_manager_actor::RdmaManagerActor;
3232
use crate::rdma_manager_actor::RdmaManagerMessageClient;
3333

34+
// Utility to validate execution context. Remote Exectuion environments do
35+
// not always have access to the nvidia_peermem module and/or set the PeerMappingOverride
36+
// parameter due to security. This function can be used to validate that the execution context when
37+
// running the tests that need this functionality (ie. cudaHostRegisterIoMemory)
38+
pub async fn validate_execution_context() -> Result<(), anyhow::Error> {
39+
// Check for nvidia peermem
40+
match std::fs::read_to_string("/proc/modules") {
41+
Ok(contents) => {
42+
if !contents.contains("nvidia_peermem") {
43+
return Err(anyhow::anyhow!(
44+
"nvidia_peermem module not found in /proc/modules"
45+
));
46+
}
47+
}
48+
Err(e) => {
49+
return Err(anyhow::anyhow!(e));
50+
}
51+
}
52+
53+
// Test file access to nvidia params
54+
match std::fs::read_to_string("/proc/driver/nvidia/params") {
55+
Ok(contents) => {
56+
if !contents.contains("PeerMappingOverride=1") {
57+
return Err(anyhow::anyhow!(
58+
"PeerMappingOverride=1 not found in /proc/driver/nvidia/params"
59+
));
60+
}
61+
}
62+
Err(e) => {
63+
return Err(anyhow::anyhow!(e));
64+
}
65+
}
66+
Ok(())
67+
}
3468
// Waits for the completion of an RDMA operation.
3569

3670
// This function polls for the completion of an RDMA operation by repeatedly

0 commit comments

Comments
 (0)