diff --git a/collector/src/bin/collector.rs b/collector/src/bin/collector.rs index e91720dcd..93ba85cac 100644 --- a/collector/src/bin/collector.rs +++ b/collector/src/bin/collector.rs @@ -65,8 +65,13 @@ use database::{ CommitType, Connection, Pool, }; +/// Directory used to cache downloaded Rust toolchains on disk. const TOOLCHAIN_CACHE_DIRECTORY: &str = "cache"; +/// Maximum allowed number of toolchains in the toolchain cache directory. +/// If the directory contains more toolchains than this, the whole directory is purged. +const TOOLCHAIN_CACHE_MAX_TOOLCHAINS: usize = 30; + fn n_normal_benchmarks_remaining(n: usize) -> String { let suffix = if n == 1 { "" } else { "s" }; format!("{n} normal benchmark{suffix} remaining") @@ -1269,15 +1274,8 @@ fn main_result() -> anyhow::Result { } Commands::InstallNext { codegen_backends } => { - let last_sha = Command::new("git") - .arg("ls-remote") - .arg("https://github.com/rust-lang/rust.git") - .arg("master") - .output() - .unwrap(); - let last_sha = String::from_utf8(last_sha.stdout).expect("utf8"); - let last_sha = last_sha.split_whitespace().next().expect(&last_sha); - let commit = get_commit_or_fake_it(last_sha).expect("success"); + let last_sha = get_latest_sha("https://github.com/rust-lang/rust").unwrap(); + let commit = get_commit_or_fake_it(&last_sha).expect("success"); let rt = build_async_runtime(); let mut sysroot = rt @@ -1434,6 +1432,8 @@ async fn run_job_queue_benchmarks( all_compile_benchmarks: Vec, check_git_sha: bool, ) -> anyhow::Result<()> { + let _ = tidy_toolchain_cache_dir(); + let mut last_request_tag = None; while let Some((benchmark_job, artifact_id)) = conn @@ -1444,20 +1444,25 @@ async fn run_job_queue_benchmarks( ) .await? { + // Are we benchmarking a different benchmark request than in the previous iteration of the + // loop? 
+ let is_new_request = last_request_tag.is_some() + && last_request_tag.as_deref() != Some(benchmark_job.request_tag()); + if is_new_request { + let _ = tidy_toolchain_cache_dir(); + } + // Here we check if we should update our commit SHA, if rustc-perf has been updated. // We only check for updates when we switch *benchmark requests*, not *benchmark jobs*, // to avoid changing code in the middle of benchmarking the same request. // Note that if an update happens, the job that we have just dequeued will have its deque // counter increased. But since updates are relatively rare, that shouldn't be a big deal, // it will be dequeued again when the collector starts again. - if check_git_sha - && last_request_tag.is_some() - && last_request_tag.as_deref() != Some(benchmark_job.request_tag()) - && needs_git_update(collector) - { + if check_git_sha && is_new_request && needs_git_update(collector) { log::warn!("Exiting collector to update itself from git."); return Ok(()); } + last_request_tag = Some(benchmark_job.request_tag().to_string()); log::info!("Dequeued job {benchmark_job:?}, artifact_id {artifact_id:?}"); @@ -1523,6 +1528,23 @@ async fn run_job_queue_benchmarks( Ok(()) } +/// Check the toolchain cache directory and delete it if it grows too large. +/// Currently, we just assume that "too large" means "has more than N toolchains". +fn tidy_toolchain_cache_dir() -> std::io::Result<()> { + let dir_count = Path::new(TOOLCHAIN_CACHE_DIRECTORY) + .read_dir()? 
+ .filter_map(|e| e.ok()) + .filter_map(|d| d.file_type().ok()) + .filter(|t| t.is_dir()) + .count(); + if dir_count > TOOLCHAIN_CACHE_MAX_TOOLCHAINS { + log::warn!("Purging toolchain cache directory at {TOOLCHAIN_CACHE_DIRECTORY}"); + // Just remove the whole directory, to avoid having to figure out which toolchains are old + std::fs::remove_dir_all(TOOLCHAIN_CACHE_DIRECTORY)?; + } + Ok(()) +} + /// Returns true if the commit SHA of collector does not match the latest commit SHA of the master /// branch of https://github.com/rust-lang/rustc-perf. fn needs_git_update(collector: &CollectorConfig) -> bool { @@ -1530,21 +1552,8 @@ fn needs_git_update(collector: &CollectorConfig) -> bool { return false; }; - let mut cmd = Command::new("git"); - cmd.arg("ls-remote") - .arg("https://github.com/rust-lang/rustc-perf") - .arg("HEAD"); - let upstream_sha = match command_output(&mut cmd) { - Ok(output) => String::from_utf8(output.stdout) - .unwrap() - .split_whitespace() - .next() - .unwrap() - .to_string(), - Err(error) => { - log::error!("Cannot determine latest SHA of rustc-perf: {error:?}"); - return false; - } + let Ok(upstream_sha) = get_latest_sha("https://github.com/rust-lang/rustc-perf") else { + return false; }; if commit_sha != upstream_sha { log::warn!( @@ -1556,6 +1565,23 @@ fn needs_git_update(collector: &CollectorConfig) -> bool { } } +/// Returns the latest SHA of the default branch (`HEAD`) of `repo`, or an error if there is none. +fn get_latest_sha(repo: &str) -> anyhow::Result<String> { + let mut cmd = Command::new("git"); + cmd.arg("ls-remote").arg(repo).arg("HEAD"); + match command_output(&mut cmd) { + Ok(output) => Ok(String::from_utf8(output.stdout)? + .split_whitespace() + .next() + .ok_or_else(|| anyhow::anyhow!("`git ls-remote {repo} HEAD` printed no SHA"))? + .to_string()), + Err(error) => { + log::error!("Cannot determine latest SHA of {repo}: {error:?}"); + Err(error) + } + } +} + /// Error that happened during benchmarking of a job. enum BenchmarkJobError { /// The error is non-recoverable. 
@@ -1606,8 +1632,6 @@ async fn run_benchmark_job( }; // Avoid redownloading the same sysroot multiple times for different jobs, even // across collector restarts. - - // TODO: Periodically clear the cache directory to avoid running out of disk space. sysroot.preserve(); Toolchain::from_sysroot(&sysroot, commit.sha.clone()) } diff --git a/collector/src/toolchain.rs b/collector/src/toolchain.rs index 2f2caed3d..946312487 100644 --- a/collector/src/toolchain.rs +++ b/collector/src/toolchain.rs @@ -43,7 +43,13 @@ impl Sysroot { triple: &str, backends: &[CodegenBackend], ) -> Result { - let cache_directory = cache_directory.join(triple).join(&sha); + // The structure of this directory is load-bearing. + // We use the commit SHA as the top-level key, to have a quick way of estimating how many + // toolchains have been installed in the cache directory. + // We also use a nested directory below the target tuple, because rustc outputs weird things + // when we query it with `--print sysroot` and its sysroot is located in a directory that + // corresponds to a valid target name. + let cache_directory = cache_directory.join(&sha).join(triple).join("toolchain"); fs::create_dir_all(&cache_directory).map_err(|e| SysrootDownloadError::IO(e.into()))?; let download = SysrootDownload {