From 027135976849da34250d7638502bcc2a24f741d7 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Fri, 5 Sep 2025 09:32:56 +0000 Subject: [PATCH 1/3] Make the allocator shim participate in LTO again --- compiler/rustc_codegen_ssa/src/back/lto.rs | 14 ------- compiler/rustc_codegen_ssa/src/back/write.rs | 40 +++++++++++++++----- compiler/rustc_codegen_ssa/src/base.rs | 11 +++++- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/compiler/rustc_codegen_ssa/src/back/lto.rs b/compiler/rustc_codegen_ssa/src/back/lto.rs index f4a9037940a67..e6df6a2469f37 100644 --- a/compiler/rustc_codegen_ssa/src/back/lto.rs +++ b/compiler/rustc_codegen_ssa/src/back/lto.rs @@ -1,7 +1,6 @@ use std::ffi::CString; use std::sync::Arc; -use rustc_ast::expand::allocator::AllocatorKind; use rustc_data_structures::memmap::Mmap; use rustc_hir::def_id::{CrateNum, LOCAL_CRATE}; use rustc_middle::middle::exported_symbols::{ExportedSymbol, SymbolExportInfo, SymbolExportLevel}; @@ -96,19 +95,6 @@ pub(super) fn exported_symbols_for_lto( .filter_map(|&(s, info): &(ExportedSymbol<'_>, SymbolExportInfo)| { if info.level.is_below_threshold(export_threshold) || info.used { Some(symbol_name_for_instance_in_crate(tcx, s, cnum)) - } else if export_threshold == SymbolExportLevel::C - && info.rustc_std_internal_symbol - && let Some(AllocatorKind::Default) = allocator_kind_for_codegen(tcx) - { - // Export the __rdl_* exports for usage by the allocator shim when not using - // #[global_allocator]. Most of the conditions above are only used to avoid - // unnecessary expensive symbol_name_for_instance_in_crate calls. 
- let sym = symbol_name_for_instance_in_crate(tcx, s, cnum); - if sym.contains("__rdl_") || sym.contains("__rg_oom") { - Some(sym) - } else { - None - } } else { None } diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs index f637e7f58dbf7..afda7226fdd12 100644 --- a/compiler/rustc_codegen_ssa/src/back/write.rs +++ b/compiler/rustc_codegen_ssa/src/back/write.rs @@ -334,6 +334,7 @@ pub struct CodegenContext { pub output_filenames: Arc, pub invocation_temp: Option, pub module_config: Arc, + pub allocator_config: Arc, pub tm_factory: TargetMachineFactoryFn, pub msvc_imps_needed: bool, pub is_pe_coff: bool, @@ -794,12 +795,19 @@ pub(crate) fn compute_per_cgu_lto_type( sess_lto: &Lto, opts: &config::Options, sess_crate_types: &[CrateType], + module_kind: ModuleKind, ) -> ComputedLtoType { // If the linker does LTO, we don't have to do it. Note that we // keep doing full LTO, if it is requested, as not to break the // assumption that the output will be a single module. let linker_does_lto = opts.cg.linker_plugin_lto.enabled(); + // When we're automatically doing ThinLTO for multi-codegen-unit + // builds we don't actually want to LTO the allocator module if + // it shows up. This is due to various linker shenanigans that + // we'll encounter later. + let is_allocator = module_kind == ModuleKind::Allocator; + // We ignore a request for full crate graph LTO if the crate type // is only an rlib, as there is no full crate graph to process, // that'll happen later. 
@@ -811,7 +819,7 @@ pub(crate) fn compute_per_cgu_lto_type( let is_rlib = matches!(sess_crate_types, [CrateType::Rlib]); match sess_lto { - Lto::ThinLocal if !linker_does_lto => ComputedLtoType::Thin, + Lto::ThinLocal if !linker_does_lto && !is_allocator => ComputedLtoType::Thin, Lto::Thin if !linker_does_lto && !is_rlib => ComputedLtoType::Thin, Lto::Fat if !is_rlib => ComputedLtoType::Fat, _ => ComputedLtoType::No, @@ -825,18 +833,23 @@ fn execute_optimize_work_item( let dcx = cgcx.create_dcx(); let dcx = dcx.handle(); - B::optimize(cgcx, dcx, &mut module, &cgcx.module_config); + let module_config = match module.kind { + ModuleKind::Regular => &cgcx.module_config, + ModuleKind::Allocator => &cgcx.allocator_config, + }; + + B::optimize(cgcx, dcx, &mut module, module_config); // After we've done the initial round of optimizations we need to // decide whether to synchronously codegen this module or ship it // back to the coordinator thread for further LTO processing (which // has to wait for all the initial modules to be optimized). - let lto_type = compute_per_cgu_lto_type(&cgcx.lto, &cgcx.opts, &cgcx.crate_types); + let lto_type = compute_per_cgu_lto_type(&cgcx.lto, &cgcx.opts, &cgcx.crate_types, module.kind); // If we're doing some form of incremental LTO then we need to be sure to // save our module to disk first. 
- let bitcode = if cgcx.module_config.emit_pre_lto_bc { + let bitcode = if module_config.emit_pre_lto_bc { let filename = pre_lto_bitcode_filename(&module.name); cgcx.incr_comp_session_dir.as_ref().map(|path| path.join(&filename)) } else { @@ -845,7 +858,7 @@ fn execute_optimize_work_item( match lto_type { ComputedLtoType::No => { - let module = B::codegen(cgcx, module, &cgcx.module_config); + let module = B::codegen(cgcx, module, module_config); WorkItemResult::Finished(module) } ComputedLtoType::Thin => { @@ -1133,6 +1146,7 @@ fn start_executing_work( diag_emitter: shared_emitter.clone(), output_filenames: Arc::clone(tcx.output_filenames(())), module_config: regular_config, + allocator_config, tm_factory: backend.target_machine_factory(tcx.sess, ol, backend_features), msvc_imps_needed: msvc_imps_needed(tcx), is_pe_coff: tcx.sess.target.is_like_windows, @@ -1147,11 +1161,6 @@ fn start_executing_work( invocation_temp: sess.invocation_temp.clone(), }; - let compiled_allocator_module = allocator_module.map(|mut allocator_module| { - B::optimize(&cgcx, tcx.sess.dcx(), &mut allocator_module, &allocator_config); - B::codegen(&cgcx, allocator_module, &allocator_config) - }); - // This is the "main loop" of parallel work happening for parallel codegen. // It's here that we manage parallelism, schedule work, and work with // messages coming from clients. @@ -1331,6 +1340,17 @@ fn start_executing_work( let mut llvm_start_time: Option> = None; + let compiled_allocator_module = allocator_module.and_then(|allocator_module| { + match execute_optimize_work_item(&cgcx, allocator_module) { + WorkItemResult::Finished(compiled_module) => return Some(compiled_module), + WorkItemResult::NeedsFatLto(fat_lto_input) => needs_fat_lto.push(fat_lto_input), + WorkItemResult::NeedsThinLto(name, thin_buffer) => { + needs_thin_lto.push((name, thin_buffer)) + } + } + None + }); + // Run the message loop while there's still anything that needs message // processing. 
Note that as soon as codegen is aborted we simply want to // wait for all existing work to finish, so many of the conditions here diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs index a9a2ae1b3dbca..45b028aa8eff2 100644 --- a/compiler/rustc_codegen_ssa/src/base.rs +++ b/compiler/rustc_codegen_ssa/src/base.rs @@ -46,7 +46,9 @@ use crate::meth::load_vtable; use crate::mir::operand::OperandValue; use crate::mir::place::PlaceRef; use crate::traits::*; -use crate::{CachedModuleCodegen, CodegenLintLevels, CrateInfo, ModuleCodegen, errors, meth, mir}; +use crate::{ + CachedModuleCodegen, CodegenLintLevels, CrateInfo, ModuleCodegen, ModuleKind, errors, meth, mir, +}; pub(crate) fn bin_op_to_icmp_predicate(op: BinOp, signed: bool) -> IntPredicate { match (op, signed) { @@ -1124,7 +1126,12 @@ pub fn determine_cgu_reuse<'tcx>(tcx: TyCtxt<'tcx>, cgu: &CodegenUnit<'tcx>) -> // We can re-use either the pre- or the post-thinlto state. If no LTO is // being performed then we can use post-LTO artifacts, otherwise we must // reuse pre-LTO artifacts - match compute_per_cgu_lto_type(&tcx.sess.lto(), &tcx.sess.opts, tcx.crate_types()) { + match compute_per_cgu_lto_type( + &tcx.sess.lto(), + &tcx.sess.opts, + tcx.crate_types(), + ModuleKind::Regular, + ) { ComputedLtoType::No => CguReuse::PostLto, _ => CguReuse::PreLto, } From 9239d141dc4e9435b036d34cf8b2cb1e37b9f454 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Fri, 5 Sep 2025 14:46:46 +0000 Subject: [PATCH 2/3] Add test that __rg_oom doesn't get internalized during LTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Rémy Rakic --- tests/ui/lto/lto-global-allocator.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/ui/lto/lto-global-allocator.rs diff --git a/tests/ui/lto/lto-global-allocator.rs b/tests/ui/lto/lto-global-allocator.rs new file mode 100644 index 
0000000000000..03f11709c901a --- /dev/null +++ b/tests/ui/lto/lto-global-allocator.rs @@ -0,0 +1,19 @@ +//@ compile-flags: --crate-type cdylib -C lto +//@ build-pass +//@ no-prefer-dynamic +//@ needs-crate-type: cdylib + +use std::alloc::{GlobalAlloc, Layout}; + +struct MyAllocator; + +unsafe impl GlobalAlloc for MyAllocator { + unsafe fn alloc(&self, _layout: Layout) -> *mut u8 { + todo!() + } + + unsafe fn dealloc(&self, _ptr: *mut u8, _layout: Layout) {} +} + +#[global_allocator] +static GLOBAL: MyAllocator = MyAllocator; From 2cf94b92ca852924ad90943a0c469f01742216a6 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Fri, 5 Sep 2025 19:09:39 +0000 Subject: [PATCH 3/3] Ensure fat LTO doesn't merge everything into the allocator module --- compiler/rustc_codegen_cranelift/src/driver/aot.rs | 11 ++++++++++- compiler/rustc_codegen_llvm/src/back/lto.rs | 8 +++++++- compiler/rustc_codegen_ssa/src/back/write.rs | 3 ++- compiler/rustc_codegen_ssa/src/lib.rs | 2 ++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/compiler/rustc_codegen_cranelift/src/driver/aot.rs b/compiler/rustc_codegen_cranelift/src/driver/aot.rs index c3adb5e767e25..7e77781dc2fc1 100644 --- a/compiler/rustc_codegen_cranelift/src/driver/aot.rs +++ b/compiler/rustc_codegen_cranelift/src/driver/aot.rs @@ -12,7 +12,9 @@ use cranelift_object::{ObjectBuilder, ObjectModule}; use rustc_codegen_ssa::assert_module_sources::CguReuse; use rustc_codegen_ssa::back::link::ensure_removed; use rustc_codegen_ssa::base::determine_cgu_reuse; -use rustc_codegen_ssa::{CodegenResults, CompiledModule, CrateInfo, errors as ssa_errors}; +use rustc_codegen_ssa::{ + CodegenResults, CompiledModule, CrateInfo, ModuleKind, errors as ssa_errors, +}; use rustc_data_structures::profiling::SelfProfilerRef; use rustc_data_structures::stable_hasher::{HashStable, StableHasher}; use rustc_data_structures::sync::{IntoDynSyncSend, par_map}; @@ -361,6 +363,7 @@ fn emit_cgu( invocation_temp, 
prof, product.object, + ModuleKind::Regular, name.clone(), producer, )?; @@ -369,6 +372,7 @@ fn emit_cgu( module_regular, module_global_asm: global_asm_object_file.map(|global_asm_object_file| CompiledModule { name: format!("{name}.asm"), + kind: ModuleKind::Regular, object: Some(global_asm_object_file), dwarf_object: None, bytecode: None, @@ -385,6 +389,7 @@ fn emit_module( invocation_temp: Option<&str>, prof: &SelfProfilerRef, mut object: cranelift_object::object::write::Object<'_>, + kind: ModuleKind, name: String, producer_str: &str, ) -> Result { @@ -425,6 +430,7 @@ fn emit_module( Ok(CompiledModule { name, + kind, object: Some(tmp_file), dwarf_object: None, bytecode: None, @@ -479,6 +485,7 @@ fn reuse_workproduct_for_cgu( Ok(ModuleCodegenResult { module_regular: CompiledModule { name: cgu.name().to_string(), + kind: ModuleKind::Regular, object: Some(obj_out_regular), dwarf_object: None, bytecode: None, @@ -488,6 +495,7 @@ fn reuse_workproduct_for_cgu( }, module_global_asm: source_file_global_asm.map(|source_file| CompiledModule { name: cgu.name().to_string(), + kind: ModuleKind::Regular, object: Some(obj_out_global_asm), dwarf_object: None, bytecode: None, @@ -643,6 +651,7 @@ fn emit_allocator_module(tcx: TyCtxt<'_>) -> Option { tcx.sess.invocation_temp.as_deref(), &tcx.sess.prof, product.object, + ModuleKind::Allocator, "allocator_shim".to_owned(), &crate::debuginfo::producer(tcx.sess), ) { diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs index 326b876e7e689..ad2e722cfef62 100644 --- a/compiler/rustc_codegen_llvm/src/back/lto.rs +++ b/compiler/rustc_codegen_llvm/src/back/lto.rs @@ -11,7 +11,7 @@ use object::{Object, ObjectSection}; use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule, ThinShared}; use rustc_codegen_ssa::back::write::{CodegenContext, FatLtoInput}; use rustc_codegen_ssa::traits::*; -use rustc_codegen_ssa::{ModuleCodegen, looks_like_rust_object_file}; +use 
rustc_codegen_ssa::{ModuleCodegen, ModuleKind, looks_like_rust_object_file}; use rustc_data_structures::fx::FxHashMap; use rustc_data_structures::memmap::Mmap; use rustc_errors::DiagCtxtHandle; @@ -225,9 +225,15 @@ fn fat_lto( // All the other modules will be serialized and reparsed into the new // context, so this hopefully avoids serializing and parsing the largest // codegen unit. + // + // Additionally use a regular module as the base here to ensure that various + // file copy operations in the backend work correctly. The only other kind + // of module here should be an allocator one, and if your crate is smaller + // than the allocator module then the size doesn't really matter anyway. let costliest_module = in_memory .iter() .enumerate() + .filter(|&(_, module)| module.kind == ModuleKind::Regular) .map(|(i, module)| { let cost = unsafe { llvm::LLVMRustModuleCost(module.module_llvm.llmod()) }; (cost, i) diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs index afda7226fdd12..95e02a7c6dbc8 100644 --- a/compiler/rustc_codegen_ssa/src/back/write.rs +++ b/compiler/rustc_codegen_ssa/src/back/write.rs @@ -490,7 +490,7 @@ fn copy_all_cgu_workproducts_to_incr_comp_cache_dir( let _timer = sess.timer("copy_all_cgu_workproducts_to_incr_comp_cache_dir"); - for module in &compiled_modules.modules { + for module in compiled_modules.modules.iter().filter(|m| m.kind == ModuleKind::Regular) { let mut files = Vec::new(); if let Some(object_file_path) = &module.object { files.push((OutputType::Object.extension(), object_file_path.as_path())); @@ -960,6 +960,7 @@ fn execute_copy_from_cache_work_item( WorkItemResult::Finished(CompiledModule { links_from_incr_cache, + kind: ModuleKind::Regular, name: module.name, object, dwarf_object, diff --git a/compiler/rustc_codegen_ssa/src/lib.rs b/compiler/rustc_codegen_ssa/src/lib.rs index 23146661f27c0..baba8f9ca3e8b 100644 --- a/compiler/rustc_codegen_ssa/src/lib.rs +++ 
b/compiler/rustc_codegen_ssa/src/lib.rs @@ -120,6 +120,7 @@ impl ModuleCodegen { CompiledModule { name: self.name, + kind: self.kind, object, dwarf_object, bytecode, @@ -133,6 +134,7 @@ impl ModuleCodegen { #[derive(Debug, Encodable, Decodable)] pub struct CompiledModule { pub name: String, + pub kind: ModuleKind, pub object: Option<PathBuf>, pub dwarf_object: Option<PathBuf>, pub bytecode: Option<PathBuf>,