From 940ae3cabaae0a3dac1b3ae2eb81a415d894d461 Mon Sep 17 00:00:00 2001 From: Javier Tia Date: Fri, 19 Jun 2026 15:25:12 -0600 Subject: [PATCH 1/6] Don't panic when a finished compile has neither code nor signal get_signal did `status.signal().expect("must have signal")`, assuming the Unix invariant that an ExitStatus with no exit code was terminated by a signal. That does not always hold: an ExitStatus reconstructed for a distributed compile (or an abnormal wait status such as WIFSTOPPED) can report neither a code nor a signal. When that happened the expect() panicked the compile task, which the server surfaced as a misleading "Failed to bind socket" and, under load, repeatedly fell back to local compilation. Return Option<i32> from get_signal and assign it straight into res.signal, so a compile that reports neither code nor signal leaves res.signal unset instead of crashing the in-flight task. The Windows arm returns None rather than panicking; ExitStatus::code() is always Some there, so the signal branch is never reached anyway. Add a unit test covering a real terminating signal (SIGKILL) and the neither-code-nor-signal case (WIFSTOPPED via from_raw), which previously panicked. Signed-off-by: Javier Tia --- src/server.rs | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/src/server.rs b/src/server.rs index 98cedfbbc..25ba43a9a 100644 --- a/src/server.rs +++ b/src/server.rs @@ -130,13 +130,20 @@ fn notify_server_startup(name: Option<&OsString>, status: ServerStartup) -> Resu } #[cfg(unix)] -fn get_signal(status: ExitStatus) -> i32 { +fn get_signal(status: ExitStatus) -> Option<i32> { use std::os::unix::prelude::*; - status.signal().expect("must have signal") + // None when the process produced neither an exit code nor a terminating + // signal - e.g. an ExitStatus synthesized from a distributed-compile result, + // or an abnormal wait status.
Previously this was `.expect("must have + // signal")`, which panicked the compile task (surfacing as a misleading + // "Failed to bind socket") and could be tripped repeatedly under load. + status.signal() } #[cfg(windows)] -fn get_signal(_status: ExitStatus) -> i32 { - panic!("no signals on windows") +fn get_signal(_status: ExitStatus) -> Option<i32> { + // On Windows ExitStatus::code() is always Some, so the signal branch is + // never reached; return None rather than panicking. + None } pub struct DistClientContainer { @@ -1461,7 +1468,7 @@ where match status.code() { Some(code) => res.retcode = Some(code), - None => res.signal = Some(get_signal(status)), + None => res.signal = get_signal(status), } res.stdout = stdout; @@ -1477,7 +1484,7 @@ where match output.status.code() { Some(code) => res.retcode = Some(code), - None => res.signal = Some(get_signal(output.status)), + None => res.signal = get_signal(output.status), } res.stdout = output.stdout; res.stderr = output.stderr; @@ -2282,6 +2289,26 @@ fn waits_until_zero() { mod tests { use super::*; + #[cfg(unix)] + #[test] + fn test_get_signal_handles_signal_and_abnormal_status() { + use std::os::unix::process::ExitStatusExt; + + // Terminated by SIGKILL: a real terminating signal is reported. + let killed = ExitStatus::from_raw(9); + assert_eq!(killed.code(), None); + assert_eq!(get_signal(killed), Some(9)); + + // A wait status that is neither a normal exit (code) nor a terminating + // signal - here WIFSTOPPED (low byte 0x7f). The same neither-code-nor- + // signal shape can arise from an ExitStatus synthesized for a + // distributed compile. get_signal must return None, not panic + // "must have signal" (which used to crash the in-flight compile task).
+ let abnormal = ExitStatus::from_raw(0x7f); + assert_eq!(abnormal.code(), None); + assert_eq!(get_signal(abnormal), None); + } + struct StringWriter { buffer: String, } From 5e244f726cb9f623e55954f7b4f2a804733838c6 Mon Sep 17 00:00:00 2001 From: Javier Tia Date: Fri, 19 Jun 2026 17:38:39 -0600 Subject: [PATCH 2/6] Bundle relocated interpreter's libdir into the dist toolchain package sccache-dist packages a toolchain's shared libraries by parsing `ldd` output, which resolves NEEDED libraries against the host's dynamic loader. Yocto/OpenEmbedded "uninative" cross toolchains ship a relocated glibc whose loader has a built-in search path pointing at its own sysroot. For those binaries `ldd` reports host paths (e.g. /usr/lib/libm.so.6), yet inside the build sandbox the relocated loader searches its own sysroot lib dir, where those libraries were never packaged. The remote compile then dies with "libm.so.6: cannot open shared object file" and silently falls back to local compilation, so distribution never actually runs. When a packaged executable's PT_INTERP lives outside the standard host loader directories, also bundle the interpreter's own directory. That directory holds the libc/libm the relocated loader resolves against, so they land at the absolute path the loader searches inside the sandbox. Standard host toolchains are untouched: their interpreter is under /lib, /lib64, or /usr/lib, so the existing ldd-only path is preserved. Signed-off-by: Javier Tia --- src/dist/pkg.rs | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/src/dist/pkg.rs b/src/dist/pkg.rs index 8bc1744a4..e7cab535c 100644 --- a/src/dist/pkg.rs +++ b/src/dist/pkg.rs @@ -98,6 +98,20 @@ mod toolchain_imp { } pub fn add_executable_and_deps(&mut self, executable: PathBuf) -> Result<()> { + // A relocated program interpreter (e.g. a Yocto/OE uninative + // toolchain) resolves its libc/libm against its own sysroot, not the + // host paths ldd reports. 
Bundle the interpreter's directory so those + // libraries exist where the loader looks inside the build sandbox. + if let Some(libdir) = read_elf_interpreter(&executable) + .and_then(|interp| relocated_interpreter_libdir(&interp)) + { + self.add_dir_contents(&libdir).with_context(|| { + format!( + "Failed to bundle relocated interpreter libdir {}", + libdir.display() + ) + })?; + } let mut remaining = vec![executable]; while let Some(obj_path) = remaining.pop() { assert!(obj_path.is_absolute()); @@ -350,6 +364,50 @@ mod toolchain_imp { libs } + /// Read a binary's ELF program interpreter (PT_INTERP), if it has one. + /// + /// Returns `None` for static binaries, non-ELF or non-64-bit files, and + /// anything that fails to parse -- callers treat that as "nothing special + /// to bundle", preserving the default ldd-only behaviour. + fn read_elf_interpreter(executable: &Path) -> Option<PathBuf> { + use object::Endianness; + use object::read::elf::{ElfFile64, ProgramHeader}; + + let data = fs::read(executable).ok()?; + let elf = ElfFile64::<Endianness>::parse(data.as_slice()).ok()?; + let endian = elf.endian(); + for header in elf.elf_program_headers() { + if let Ok(Some(interp)) = header.interpreter(endian, elf.data()) { + return str::from_utf8(interp).ok().map(PathBuf::from); + } + } + None + } + + /// If `interp` is a relocated program interpreter -- one living outside the + /// standard host loader directories -- return the directory that should be + /// bundled alongside the toolchain. + /// + /// Yocto/OpenEmbedded "uninative" cross toolchains ship their own glibc and + /// loader, and the loader's built-in search path points at its own sysroot + /// lib dir rather than the host's. `ldd` resolves a binary's NEEDED + /// libraries against the *host* loader, so for these binaries it reports + /// host paths (e.g. /usr/lib/libm.so.6) that do not exist where the + /// relocated loader actually searches at runtime.
Bundling the + /// interpreter's own directory -- which holds the matching libc/libm and + /// the loader itself -- makes the toolchain resolvable inside the sandbox. + fn relocated_interpreter_libdir(interp: &Path) -> Option<PathBuf> { + const STANDARD_LOADER_PREFIXES: &[&str] = &["/lib/", "/lib64/", "/usr/lib/", "/usr/lib64/"]; + let interp_str = interp.to_str()?; + if STANDARD_LOADER_PREFIXES + .iter() + .any(|prefix| interp_str.starts_with(prefix)) + { + return None; + } + interp.parent().map(Path::to_path_buf) + } + #[test] fn test_ldd_parse() { let ubuntu_ls_output = "\tlinux-vdso.so.1 => (0x00007fffcfffe000) @@ -409,6 +467,35 @@ mod toolchain_imp { ] ); } + + #[test] + fn test_relocated_interpreter_libdir() { + // Standard host loaders are left to ldd's host resolution. + for standard in [ + "/lib64/ld-linux-x86-64.so.2", + "/usr/lib64/ld-linux-x86-64.so.2", + "/usr/lib/ld-linux-aarch64.so.1", + "/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2", + ] { + assert_eq!( + relocated_interpreter_libdir(Path::new(standard)), + None, + "{standard} is a standard host loader and must not trigger bundling" + ); + } + + // A relocated loader (e.g. a Yocto/OE uninative toolchain) resolves its + // libc/libm against its own sysroot lib dir, which ldd does not report. + // Return that directory so it gets bundled into the toolchain package.
+ assert_eq!( + relocated_interpreter_libdir(Path::new( + "/home/u/build/tmp/sysroots-uninative/x86_64-linux/lib/ld-linux-x86-64.so.2" + )), + Some(PathBuf::from( + "/home/u/build/tmp/sysroots-uninative/x86_64-linux/lib" + )) + ); + } } pub fn make_tar_header(src: &Path, dest: &str) -> io::Result<tar::Header> { From 88d709c6b0223e365ba305c60cfdef753e16a368 Mon Sep 17 00:00:00 2001 From: Javier Tia Date: Tue, 23 Jun 2026 13:02:45 -0600 Subject: [PATCH 3/6] compiler/gcc: fix distributed compile of relative input paths sccache-dist ships the preprocessed input through the inputs packager keyed on the absolute, simplified path cwd.join(input) (CInputsPackager), but the distributed compile command referenced the raw parsed_args.input. For out-of-tree builds the input is relative (e.g. OpenEmbedded's ../sources/foo.c), so the command and the packaged input disagreed and the build-server compiled a path the inputs were never placed at, failing with "cc1: fatal error: ... No such file or directory". Transform the same absolute, simplified path in the dist command so it matches the packaged input. An absolute input is unchanged, since cwd.join of an absolute path returns it verbatim. Signed-off-by: Javier Tia --- src/compiler/gcc.rs | 69 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/src/compiler/gcc.rs b/src/compiler/gcc.rs index 7bd4f9689..7e85210f3 100644 --- a/src/compiler/gcc.rs +++ b/src/compiler/gcc.rs @@ -1021,7 +1021,14 @@ where } arguments.extend(vec![ parsed_args.compilation_flag.clone().into_string().ok()?, - path_transformer.as_dist(&parsed_args.input)?, + // Match the path the inputs packager ships the preprocessed + // content at (CInputsPackager uses cwd.join(input) + simplify_path). + // parsed_args.input is relative for out-of-tree builds (e.g.
OE's + // `../sources/foo.c`); passing it raw made the server look for the + // input at a path the package never placed it, failing with + // "No such file or directory". Absolutize+simplify so both agree. + path_transformer + .as_dist(&dist::pkg::simplify_path(&cwd.join(&parsed_args.input)).ok()?)?, "-o".into(), path_transformer.as_dist(out_file)?, ]); @@ -2478,6 +2485,66 @@ mod test { assert_eq!(0, creator.lock().unwrap().children.len()); } + #[test] + #[cfg(feature = "dist-client")] + fn test_compile_relative_input_dist_command_is_absolute() { + // Regression: the dist command must reference the same absolute, simplified + // input path the inputs packager ships the preprocessed content at + // (CInputsPackager uses cwd.join(input) + simplify_path). For out-of-tree + // builds the input is relative (e.g. OE's `../sources/foo.c`); passing it + // raw made the build-server look where the input was never placed, failing + // with "No such file or directory" (OE xz/glibc out-of-tree compiles). 
+ let f = TestFixture::new(); + let parsed_args = ParsedArguments { + input: "../foo.c".into(), + double_dash_input: false, + language: Language::C, + compilation_flag: "-c".into(), + depfile: None, + outputs: vec![( + "obj", + ArtifactDescriptor { + path: "foo.o".into(), + optional: false, + }, + )] + .into_iter() + .collect(), + dependency_args: vec![], + preprocessor_args: vec![], + common_args: vec![], + arch_args: vec![], + unhashed_args: vec![], + extra_dist_files: vec![], + extra_hash_files: vec![], + msvc_show_includes: false, + profile_generate: false, + color_mode: ColorMode::Auto, + suppress_rewrite_includes_only: false, + too_hard_for_preprocessor_cache_mode: None, + }; + let mut path_transformer = dist::PathTransformer::new(); + let (_command, dist_command, _cacheable) = generate_compile_commands( + &mut path_transformer, + &f.bins[0], + &parsed_args, + f.tempdir.path(), + &[], + CCompilerKind::Gcc, + false, + language_to_gcc_arg, + ) + .unwrap(); + let dist_command = dist_command.expect("relative input must still produce a dist command"); + // No argument may carry an unresolved `..`: the input must be the simplified + // absolute path the packager ships, not the raw relative one. + assert!( + !dist_command.arguments.iter().any(|a| a.contains("..")), + "dist command still references a relative input: {:?}", + dist_command.arguments + ); + } + #[test] fn test_compile_simple_verbose_short() { let creator = new_creator(); From 37a742d7265dbbd69f6e7661a8dc541743e53bf0 Mon Sep 17 00:00:00 2001 From: Javier Tia Date: Tue, 23 Jun 2026 14:09:55 -0600 Subject: [PATCH 4/6] compiler: log distributed-compile decisions at info level sccache logged the distribute-vs-local decision only at debug ("Compiling locally", "Attempting distributed compilation"), while an infrastructure fallback warned. 
At info a successful distribution and a local compile were both silent, so the only visible dist signal was the failure path - leaving no way to see the distribute/fallback ratio without the full debug firehose. Diagnosing why a distributed build under-distributes meant guessing. Promote the decision points to info and add a log on successful distribution naming the server and exit code, so SCCACHE_LOG=info gives a per-compile dist trace (attempt then distributed-on-server, compiled-locally, or falling-back-with-reason). sccache only emits logs when SCCACHE_LOG is set, so default runs are unaffected. Signed-off-by: Javier Tia --- src/compiler/compiler.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/compiler/compiler.rs b/src/compiler/compiler.rs index 78bb5a433..77eea183e 100644 --- a/src/compiler/compiler.rs +++ b/src/compiler/compiler.rs @@ -877,7 +877,10 @@ where let dist_client = match dist_compile_cmd.clone().and(dist_client) { Some(dc) => dc, None => { - debug!("[{}]: Compiling locally", out_pretty); + info!( + "[{}]: Compiling locally (not eligible for distributed compilation)", + out_pretty + ); return compile_cmd .execute(service, &creator) .await @@ -885,7 +888,7 @@ where } }; - debug!("[{}]: Attempting distributed compilation", out_pretty); + info!("[{}]: Attempting distributed compilation", out_pretty); let out_pretty2 = out_pretty.clone(); let local_executable = compile_cmd.get_executable(); @@ -1044,6 +1047,12 @@ where .splice(0..0, server_info.as_bytes().to_vec()); } + info!( + "[{}]: Distributed compilation finished on {} (exit code {})", + out_pretty, + server_id.addr(), + jc.output.code + ); Ok((DistType::Ok(server_id), jc.output.into())) }; From 425db1a6ac35e7955a27c795064d60e33e41c691 Mon Sep 17 00:00:00 2001 From: Javier Tia Date: Tue, 23 Jun 2026 15:03:30 -0600 Subject: [PATCH 5/6] compiler: fall back to local on distributed-compile failure A distributed compile the build-server rejects is often a 
distribution artifact, not a genuine compiler error: an object that .incbin's a binary the inputs packager does not ship - the kernel's vdso, embedded-config, and dtb wrappers - cannot be assembled remotely and returns non-zero, which failed the whole build with no recourse and forced the kernel to be excluded wholesale. Treat a non-zero remote result as a fallback trigger rather than a terminal error: recompile locally, which either succeeds (confirming a dist-only artifact) or reproduces the genuine error. A remote failure can no longer break a build that would compile locally. Only failing dist compiles are affected - successful ones are returned unchanged - so the kernel now distributes (1124/1128 compiles), with its handful of .incbin objects falling back to local. Signed-off-by: Javier Tia --- src/compiler/compiler.rs | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/compiler/compiler.rs b/src/compiler/compiler.rs index 77eea183e..d3ceee25a 100644 --- a/src/compiler/compiler.rs +++ b/src/compiler/compiler.rs @@ -973,7 +973,7 @@ where ) })?; - let mut jc = match jres { + let jc = match jres { dist::RunJobResult::Complete(jc) => jc, dist::RunJobResult::JobNotFound => bail!("Job {} not found on server", job_id), }; @@ -1039,19 +1039,26 @@ where ); if jc.output.code != 0 { - // Add server info to help diagnose host-specific failures, e.g. due to flaky hardware. - // Failed builds are not cached so this tampering should not cause too much trouble. - let server_info = format!("sccache: Job failed on server {}:\n", server_id.addr()); - jc.output - .stderr - .splice(0..0, server_info.as_bytes().to_vec()); + // A non-zero remote result is frequently a distribution artifact + // rather than a genuine compiler error: e.g. an object that + // .incbin's a binary the inputs packager did not ship (the kernel's + // vdso/dtb/embedded-config wrappers), which the build-server cannot + // assemble. 
Fall back to a local recompile via the or_else below - it + // either succeeds (confirming a dist-only artifact) or reproduces the + // real error locally, so a remote failure never breaks a build that + // would compile fine locally. This only affects failing dist + // compiles; successful ones are returned unchanged above. + bail!( + "distributed compile on {} returned exit code {}; recompiling locally", + server_id.addr(), + jc.output.code + ); } info!( - "[{}]: Distributed compilation finished on {} (exit code {})", + "[{}]: Distributed compilation finished on {}", out_pretty, - server_id.addr(), - jc.output.code + server_id.addr() ); Ok((DistType::Ok(server_id), jc.output.into())) }; From d4f3c6f464c2f1dc5ff2eb829c4364cbe511d4ae Mon Sep 17 00:00:00 2001 From: Javier Tia Date: Wed, 24 Jun 2026 16:50:12 -0600 Subject: [PATCH 6/6] compiler: fall back to local when a dist compile drops an output A distributed compile can return a successful (exit 0) result yet omit a declared output. glibc's ldconfig.o and sprof.o compile fine on the build-server, but it does not return their `.o.dt` dependency file, so zipping the compiler outputs fails fatally ("failed to open file ...o.dt: No such file") with no recourse. One dropped output among 6427 forced glibc to be excluded from distribution wholesale, even though 6425 of its compiles distribute cleanly. Capture the declared output paths before the compilation is moved into the packagers, and after a successful remote compile verify each one exists on disk. A missing output is a distribution artifact, not a compiler error, so bail into the existing local-recompile fallback (the same one that already salvages non-zero remote results), which reproduces the full output set. Only compiles whose dist output set is incomplete are affected; complete ones are returned unchanged. glibc now distributes, with ldconfig.o and sprof.o falling back to local. 
Signed-off-by: Javier Tia --- src/compiler/compiler.rs | 132 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/src/compiler/compiler.rs b/src/compiler/compiler.rs index d3ceee25a..eb5ec5195 100644 --- a/src/compiler/compiler.rs +++ b/src/compiler/compiler.rs @@ -903,6 +903,14 @@ where .map(|output| path_transformer.as_dist_abs(&cwd.join(output.path))) .collect::<Option<_>>() .context("Failed to adapt an output path for distributed compile")?; + // The local paths every declared output must occupy once the compile + // finishes. Captured before `compilation` is moved into the packagers + // below so a remote compile that drops a declared output can be detected + // and salvaged (see the missing-output check after the run completes). + let expected_output_paths: Vec<PathBuf> = compilation + .outputs() + .map(|output| cwd.join(output.path)) + .collect(); let (inputs_packager, toolchain_packager, outputs_rewriter) = compilation.into_dist_packagers(path_transformer)?; @@ -1055,6 +1063,21 @@ where ); } + // The remote compile returned exit 0 but may have dropped a declared + // output: glibc's ldconfig.o/sprof.o omit their `.o.dt` dep file, which + // the build-server does not return. Zipping the outputs later would then + // fail fatally with no recourse. Treat a missing declared output as a + // distribution artifact and fall back to a local recompile via the + // or_else below, which reproduces the full output set. Compiles whose + // dist output set is complete are unaffected.
+ if let Some(missing) = expected_output_paths.iter().find(|p| !p.exists()) { + bail!( + "distributed compile on {} did not return expected output {}; recompiling locally", + server_id.addr(), + missing.display() + ); + } + info!( "[{}]: Distributed compilation finished on {}", out_pretty, @@ -3225,6 +3248,7 @@ LLVM version: 6.0", test_dist::ErrorAllocJobClient::new(), test_dist::ErrorSubmitToolchainClient::new(), test_dist::ErrorRunJobClient::new(), + test_dist::IncompleteOutputsClient::new(), ]; // Write a dummy input file so the preprocessor cache mode can work std::fs::write(f.tempdir.path().join("foo.c"), "whatever").unwrap(); @@ -3693,4 +3717,112 @@ mod test_dist { None } } + + /// A dist client whose remote compile succeeds (exit 0) but whose returned + /// output set drops a declared output - the glibc `.o.dt` failure mode, + /// where the build-server runs the compile fine yet does not return every + /// declared output. The client must detect the missing output and fall back + /// to a local recompile rather than failing the build. + pub struct IncompleteOutputsClient { + has_started: AtomicBool, + tc: Toolchain, + output: ProcessOutput, + } + + impl IncompleteOutputsClient { + #[allow(clippy::new_ret_no_self)] + pub fn new() -> Arc<dyn dist::Client> { + Arc::new(Self { + has_started: AtomicBool::default(), + tc: Toolchain { + archive_id: "somearchiveid".to_owned(), + }, + output: ProcessOutput::fake_output(0, vec![], vec![]), + }) + } + } + + #[async_trait] + impl dist::Client for IncompleteOutputsClient { + async fn do_alloc_job(&self, tc: Toolchain) -> Result<AllocJobResult> { + assert!( + !self + .has_started + .swap(true, std::sync::atomic::Ordering::AcqRel) + ); + assert_eq!(self.tc, tc); + + Ok(AllocJobResult::Success { + job_alloc: JobAlloc { + auth: "abcd".to_owned(), + job_id: JobId(0), + server_id: ServerId::new(([0, 0, 0, 0], 1).into()), + }, + need_toolchain: true, + }) + } + async fn do_get_status(&self) -> Result<SchedulerStatusResult> { + unreachable!("fn do_get_status is not used for this test.
qed") + } + async fn do_submit_toolchain( + &self, + job_alloc: JobAlloc, + tc: Toolchain, + ) -> Result<SubmitToolchainResult> { + assert_eq!(job_alloc.job_id, JobId(0)); + assert_eq!(self.tc, tc); + + Ok(SubmitToolchainResult::Success) + } + async fn do_run_job( + &self, + job_alloc: JobAlloc, + command: CompileCommand, + outputs: Vec<String>, + inputs_packager: Box<dyn pkg::InputsPackager>, + ) -> Result<(RunJobResult, PathTransformer)> { + assert_eq!(job_alloc.job_id, JobId(0)); + assert_eq!(command.executable, "/overridden/compiler"); + + let mut inputs = vec![]; + let path_transformer = inputs_packager.write_inputs(&mut inputs).unwrap(); + // Drop one declared output to mimic a build-server that returned a + // successful compile with an incomplete output set. + let mut outputs = outputs; + outputs.pop(); + let outputs = outputs + .into_iter() + .map(|name| { + let data = format!("some data in {}", name); + let data = OutputData::try_from_reader(data.as_bytes()).unwrap(); + (name, data) + }) + .collect(); + let result = RunJobResult::Complete(JobComplete { + output: self.output.clone(), + outputs, + }); + Ok((result, path_transformer)) + } + async fn put_toolchain( + &self, + _: PathBuf, + _: String, + _: Box<dyn pkg::ToolchainPackager>, + ) -> Result<(Toolchain, Option<(String, PathBuf)>)> { + Ok(( + self.tc.clone(), + Some(( + "/overridden/compiler".to_owned(), + PathBuf::from("somearchiveid"), + )), + )) + } + fn rewrite_includes_only(&self) -> bool { + false + } + fn get_custom_toolchain(&self, _exe: &Path) -> Option<PathBuf> { + None + } + } }