-
Notifications
You must be signed in to change notification settings - Fork 17
Add --all-ranks CLI infrastructure and core processing logic #115
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
15a74df
4dfe60e
cae8159
43e2cb3
4ba11f4
eaff4df
52d8260
8f4b1b6
5bbf21c
24c52cd
13f0f5c
03e06e0
a529cd2
44cd79e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -6,6 +6,24 @@ use std::path::PathBuf; | |||||
|
||||||
use tlparse::{parse_path, ParseConfig}; | ||||||
|
||||||
// Main output filename used by both single rank and multi-rank processing | ||||||
const MAIN_OUTPUT_FILENAME: &str = "index.html"; | ||||||
|
||||||
// Helper function to setup output directory (handles overwrite logic) | ||||||
fn setup_output_directory(out_path: &PathBuf, overwrite: bool) -> anyhow::Result<()> { | ||||||
if out_path.exists() { | ||||||
if !overwrite { | ||||||
bail!( | ||||||
"Directory {} already exists, use -o OUTDIR to write to another location or pass --overwrite to overwrite the old contents", | ||||||
out_path.display() | ||||||
); | ||||||
} | ||||||
fs::remove_dir_all(&out_path)?; | ||||||
} | ||||||
fs::create_dir(&out_path)?; | ||||||
Ok(()) | ||||||
} | ||||||
|
||||||
#[derive(Parser)] | ||||||
#[command(author, version, about, long_about = None)] | ||||||
#[command(propagate_version = true)] | ||||||
|
@@ -47,12 +65,20 @@ pub struct Cli { | |||||
/// For inductor provenance tracking highlighter | ||||||
#[arg(short, long)] | ||||||
inductor_provenance: bool, | ||||||
/// Parse all ranks and generate a single unified HTML page | ||||||
#[arg(long)] | ||||||
all_ranks_html: bool, | ||||||
} | ||||||
|
||||||
fn main() -> anyhow::Result<()> { | ||||||
let cli = Cli::parse(); | ||||||
|
||||||
if cli.all_ranks_html { | ||||||
return handle_all_ranks(cli); | ||||||
} | ||||||
|
||||||
let path = if cli.latest { | ||||||
let input_path = cli.path; | ||||||
let input_path = &cli.path; | ||||||
// Path should be a directory | ||||||
if !input_path.is_dir() { | ||||||
bail!( | ||||||
|
@@ -61,7 +87,7 @@ fn main() -> anyhow::Result<()> { | |||||
); | ||||||
} | ||||||
|
||||||
let last_modified_file = std::fs::read_dir(&input_path) | ||||||
let last_modified_file = std::fs::read_dir(input_path) | ||||||
.with_context(|| format!("Couldn't access directory {}", input_path.display()))? | ||||||
.flatten() | ||||||
.filter(|f| f.metadata().unwrap().is_file()) | ||||||
|
@@ -72,45 +98,191 @@ fn main() -> anyhow::Result<()> { | |||||
}; | ||||||
last_modified_file.path() | ||||||
} else { | ||||||
cli.path | ||||||
cli.path.clone() | ||||||
}; | ||||||
|
||||||
let out_path = cli.out; | ||||||
let out_path = cli.out.clone(); | ||||||
setup_output_directory(&out_path, cli.overwrite)?; | ||||||
|
||||||
if out_path.exists() { | ||||||
if !cli.overwrite { | ||||||
bail!( | ||||||
"Directory {} already exists, use -o OUTDIR to write to another location or pass --overwrite to overwrite the old contents", | ||||||
out_path.display() | ||||||
); | ||||||
} | ||||||
fs::remove_dir_all(&out_path)?; | ||||||
// Use handle_one_rank for single rank processing (don't create directory since it already exists) | ||||||
handle_one_rank(&path, &out_path, &cli, false)?; | ||||||
|
||||||
if !cli.no_browser { | ||||||
opener::open(out_path.join("index.html"))?; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's MAIN_OUTPUT_FILENAME for? |
||||||
} | ||||||
Ok(()) | ||||||
} | ||||||
|
||||||
// Helper function to handle parsing and writing output for a single rank | ||||||
// Returns the relative path to the main output file within the rank directory | ||||||
fn handle_one_rank( | ||||||
xmfan marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
rank_path: &PathBuf, | ||||||
rank_out_dir: &PathBuf, | ||||||
cli: &Cli, | ||||||
create_output_dir: bool, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Try rewriting your callsites, so that this function either always creates the output directory, or the callsite always creates the output directory before calling into this. Hint: can you rewrite all your newly added directory creating logic using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to clarify, this would mean to remove the create_output_dir parameter and move directory creation to the callsites: Single rank: uses existing setup_output_directory Is this what you meant? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that, or you could always call setup_output_directory from within handle_one_rank |
||||||
) -> anyhow::Result<PathBuf> { | ||||||
if create_output_dir { | ||||||
fs::create_dir(rank_out_dir)?; | ||||||
} | ||||||
fs::create_dir(&out_path)?; | ||||||
|
||||||
let config = ParseConfig { | ||||||
strict: cli.strict, | ||||||
strict_compile_id: cli.strict_compile_id, | ||||||
custom_parsers: Vec::new(), | ||||||
custom_header_html: cli.custom_header_html, | ||||||
custom_header_html: cli.custom_header_html.clone(), | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why clone now? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I'm not mistaken, this is necessary because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The emphasis of the question is on "why now" rather than just "why", the code compiled and seemed to work already before. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think what happened is during your changes, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to clarify, in my current cli.rs file, I would need the clone in handle_one_rank() because handle_one_rank takes in &Cli and can't move the String out of a borrowed struct. Is this right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The clone happens because you want to create ParseConfig using a Cli ref, but do you need 1 ParseConfig per rank? Hint: Are there differences between the ParseConfig created for each rank? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm since ParseConfig is the same for all ranks, I could create ParseConfig once before processing ranks. And then pass that to handle_one_rank instead of passing cli. This would avoid repeated cloning of custom_header_html. Is this the direction you're suggesting? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that sounds reasonable |
||||||
verbose: cli.verbose, | ||||||
plain_text: cli.plain_text, | ||||||
export: cli.export, | ||||||
inductor_provenance: cli.inductor_provenance, | ||||||
all_ranks: false, | ||||||
}; | ||||||
|
||||||
let output = parse_path(&path, config)?; | ||||||
let output = parse_path(rank_path, config)?; | ||||||
|
||||||
let mut main_output_path = None; | ||||||
|
||||||
for (filename, path) in output { | ||||||
let out_file = out_path.join(filename); | ||||||
// Write output files to output directory | ||||||
for (filename, content) in output { | ||||||
let out_file = rank_out_dir.join(&filename); | ||||||
if let Some(dir) = out_file.parent() { | ||||||
fs::create_dir_all(dir)?; | ||||||
} | ||||||
fs::write(out_file, path)?; | ||||||
fs::write(out_file, content)?; | ||||||
|
||||||
// Track the main output file (typically index.html) | ||||||
if filename.file_name().and_then(|name| name.to_str()) == Some(MAIN_OUTPUT_FILENAME) { | ||||||
main_output_path = Some(filename); | ||||||
} | ||||||
} | ||||||
|
||||||
if !cli.no_browser { | ||||||
opener::open(out_path.join("index.html"))?; | ||||||
Ok(main_output_path.unwrap_or_else(|| PathBuf::from(MAIN_OUTPUT_FILENAME))) | ||||||
} | ||||||
|
||||||
// handle_all_ranks function with placeholder landing page | ||||||
fn handle_all_ranks(cli: Cli) -> anyhow::Result<()> { | ||||||
let input_path = &cli.path; | ||||||
|
||||||
if !input_path.is_dir() { | ||||||
bail!( | ||||||
"Input path {} must be a directory when using --all-ranks-html", | ||||||
input_path.display() | ||||||
); | ||||||
} | ||||||
|
||||||
let out_path = &cli.out; | ||||||
setup_output_directory(out_path, cli.overwrite)?; | ||||||
|
||||||
// Find all rank log files in the directory | ||||||
let rank_files: Vec<_> = std::fs::read_dir(input_path) | ||||||
.with_context(|| format!("Couldn't access directory {}", input_path.display()))? | ||||||
.flatten() | ||||||
.filter(|entry| { | ||||||
let path = entry.path(); | ||||||
if !path.is_file() { | ||||||
return false; | ||||||
} | ||||||
|
||||||
let Some(filename) = path.file_name().and_then(|name| name.to_str()) else { | ||||||
return false; | ||||||
}; | ||||||
|
||||||
// Only support PyTorch TORCH_TRACE files: dedicated_log_torch_trace_rank_0_hash.log | ||||||
if !filename.starts_with("dedicated_log_torch_trace_rank_") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
|| !filename.ends_with(".log") | ||||||
{ | ||||||
return false; | ||||||
} | ||||||
|
||||||
// Extract rank number from the pattern | ||||||
let after_prefix = &filename[31..]; // Remove "dedicated_log_torch_trace_rank_" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. strip_prefix? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. look at line 225, looks very very similar. could we avoid computing things twice? Maybe it's easier to express if you didn't use .filter here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To avoid computing things twice, I'm intending to use filter_map to extract rank numbers once during collection, then returning Vec<(DirEntry, String)>. Is this approach feasible to address your feedback? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah that can work. I was just thinking of falling back this logic to a foreach loop |
||||||
if let Some(underscore_pos) = after_prefix.find('_') { | ||||||
let rank_part = &after_prefix[..underscore_pos]; | ||||||
return !rank_part.is_empty() && rank_part.chars().all(|c| c.is_ascii_digit()); | ||||||
} | ||||||
|
||||||
false | ||||||
}) | ||||||
.collect(); | ||||||
|
||||||
if rank_files.is_empty() { | ||||||
bail!( | ||||||
"No rank log files found in directory {}", | ||||||
input_path.display() | ||||||
); | ||||||
} | ||||||
|
||||||
let mut rank_links = Vec::new(); | ||||||
|
||||||
// Process each rank file | ||||||
for rank_file in rank_files { | ||||||
let rank_path = rank_file.path(); | ||||||
let rank_name = rank_path | ||||||
.file_stem() | ||||||
.and_then(|name| name.to_str()) | ||||||
.unwrap_or("unknown"); | ||||||
|
||||||
// Extract rank number from PyTorch TORCH_TRACE filename | ||||||
let rank_num = | ||||||
if let Some(after_prefix) = rank_name.strip_prefix("dedicated_log_torch_trace_rank_") { | ||||||
if let Some(underscore_pos) = after_prefix.find('_') { | ||||||
let rank_part = &after_prefix[..underscore_pos]; | ||||||
if rank_part.is_empty() || !rank_part.chars().all(|c| c.is_ascii_digit()) { | ||||||
bail!( | ||||||
"Could not extract rank number from TORCH_TRACE filename: {}", | ||||||
rank_name | ||||||
); | ||||||
} | ||||||
rank_part.to_string() | ||||||
} else { | ||||||
bail!("Invalid TORCH_TRACE filename format: {}", rank_name); | ||||||
} | ||||||
} else { | ||||||
bail!( | ||||||
"Filename does not match PyTorch TORCH_TRACE pattern: {}", | ||||||
rank_name | ||||||
); | ||||||
}; | ||||||
|
||||||
println!( | ||||||
"Processing rank {} from file: {}", | ||||||
rank_num, | ||||||
rank_path.display() | ||||||
); | ||||||
|
||||||
// Create subdirectory for this rank and handle parsing | ||||||
let rank_out_dir = out_path.join(format!("rank_{rank_num}")); | ||||||
let main_output_path = handle_one_rank(&rank_path, &rank_out_dir, &cli, true)?; | ||||||
|
||||||
// Add link to this rank's page using the actual output path from handle_one_rank | ||||||
let rank_link = format!("rank_{}/{}", rank_num, main_output_path.display()); | ||||||
rank_links.push((rank_num.clone(), rank_link)); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
|
||||||
// Sort rank links by rank number | ||||||
rank_links.sort_by(|a, b| { | ||||||
let a_num: i32 = | ||||||
a.0.parse() | ||||||
.expect(&format!("Failed to parse rank number from '{}'", a.0)); | ||||||
let b_num: i32 = | ||||||
b.0.parse() | ||||||
.expect(&format!("Failed to parse rank number from '{}'", b.0)); | ||||||
a_num.cmp(&b_num) | ||||||
}); | ||||||
|
||||||
// Core logic complete - no HTML generation yet | ||||||
// TODO - Add landing page HTML generation using template system | ||||||
|
||||||
println!( | ||||||
"Generated multi-rank report with {} ranks", | ||||||
rank_links.len() | ||||||
); | ||||||
println!("Individual rank reports available in:"); | ||||||
for (rank_num, _) in &rank_links { | ||||||
println!(" - rank_{}/index.html", rank_num); | ||||||
} | ||||||
|
||||||
// No browser opening since no landing page yet | ||||||
// TODO - Generate landing page and open browser | ||||||
|
||||||
Ok(()) | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
doesn't look like this change is needed