@@ -703,10 +703,9 @@ pub(crate) unsafe fn llvm_optimize(
703703 llvm:: set_value_name ( new_fn, & name) ;
704704 }
705705
706- if cgcx. target_is_like_gpu && config. offload . contains ( & config:: Offload :: Enable ) {
706+ if cgcx. target_is_like_gpu && config. offload . contains ( & config:: Offload :: Device ) {
707707 let cx =
708708 SimpleCx :: new ( module. module_llvm . llmod ( ) , module. module_llvm . llcx , cgcx. pointer_size ) ;
709-
710709 for func in cx. get_functions ( ) {
711710 let offload_kernel = "offload-kernel" ;
712711 if attributes:: has_string_attr ( func, offload_kernel) {
@@ -775,12 +774,77 @@ pub(crate) unsafe fn llvm_optimize(
775774 )
776775 } ;
777776
778- if cgcx. target_is_like_gpu && config. offload . contains ( & config:: Offload :: Enable ) {
777+ if cgcx. target_is_like_gpu && config. offload . contains ( & config:: Offload :: Device ) {
778+ let device_path = cgcx. output_filenames . path ( OutputType :: Object ) ;
779+ let device_dir = device_path. parent ( ) . unwrap ( ) ;
780+ let device_out = device_dir. join ( "host.out" ) ;
781+ let device_out_c = path_to_c_string ( device_out. as_path ( ) ) ;
779782 unsafe {
780- llvm:: LLVMRustBundleImages ( module. module_llvm . llmod ( ) , module. module_llvm . tm . raw ( ) ) ;
783+ // 1) Bundle device module into offload image host.out (device TM)
784+ let ok = llvm:: LLVMRustBundleImages (
785+ module. module_llvm . llmod ( ) ,
786+ module. module_llvm . tm . raw ( ) ,
787+ device_out_c. as_ptr ( ) ,
788+ ) ;
789+ if !ok || !device_out. exists ( ) {
790+ dcx. emit_err ( crate :: errors:: OffloadBundleImagesFailed ) ;
791+ }
781792 }
782793 }
783794
795+ // This assumes that we previously compiled our kernels for a gpu target, which created a
796+ // `host.out` artifact. The user is supposed to provide us with a path to this artifact, we
797+ // don't need any other artifacts from the previous run. We will embed this artifact into our
798+ // LLVM-IR host module, to create a `host.o` ObjectFile, which we will write to disk.
799+ // The last, not yet automated step uses the `clang-linker-wrapper` to process `host.o`.
800+ if !cgcx. target_is_like_gpu {
801+ if let Some ( device_path) = config
802+ . offload
803+ . iter ( )
804+ . find_map ( |o| if let config:: Offload :: Host ( path) = o { Some ( path) } else { None } )
805+ {
806+ let device_pathbuf = PathBuf :: from ( device_path) ;
807+ if device_pathbuf. is_relative ( ) {
808+ dcx. emit_err ( crate :: errors:: OffloadWithoutAbsPath ) ;
809+ } else if device_pathbuf
810+ . file_name ( )
811+ . and_then ( |n| n. to_str ( ) )
812+ . is_some_and ( |n| n != "host.out" )
813+ {
814+ dcx. emit_err ( crate :: errors:: OffloadWrongFileName ) ;
815+ } else if !device_pathbuf. exists ( ) {
816+ dcx. emit_err ( crate :: errors:: OffloadNonexistingPath ) ;
817+ }
818+ let host_path = cgcx. output_filenames . path ( OutputType :: Object ) ;
819+ let host_dir = host_path. parent ( ) . unwrap ( ) ;
820+ let out_obj = host_dir. join ( "host.o" ) ;
821+ let host_out_c = path_to_c_string ( device_pathbuf. as_path ( ) ) ;
822+
823+ // 2) Finalize host: lib.bc + host.out -> host.o (host TM)
824+ // We create a full clone of our LLVM host module, since we will embed the device IR
825+ // into it, and this might break caching or incremental compilation otherwise.
826+ let llmod2 = llvm:: LLVMCloneModule ( module. module_llvm . llmod ( ) ) ;
827+ let ok =
828+ unsafe { llvm:: LLVMRustOffloadEmbedBufferInModule ( llmod2, host_out_c. as_ptr ( ) ) } ;
829+ if !ok {
830+ dcx. emit_err ( crate :: errors:: OffloadEmbedFailed ) ;
831+ }
832+ write_output_file (
833+ dcx,
834+ module. module_llvm . tm . raw ( ) ,
835+ config. no_builtins ,
836+ llmod2,
837+ & out_obj,
838+ None ,
839+ llvm:: FileType :: ObjectFile ,
840+ & cgcx. prof ,
841+ true ,
842+ ) ;
843+ // We ignore cgcx.save_temps here and unconditionally keep our `host.out` artifact.
844+ // Otherwise, recompiling the host code would fail since we deleted that device artifact
845+ // in the previous host compilation, which would be confusing at best.
846+ }
847+ }
784848 result. into_result ( ) . unwrap_or_else ( |( ) | llvm_err ( dcx, LlvmError :: RunLlvmPasses ) )
785849}
786850
0 commit comments