@@ -71,45 +71,72 @@ pub struct CudaBuilder {
71
71
/// Whether to run libnvvm optimizations. This defaults to `false`
72
72
/// but will be set to `true` if release is specified.
73
73
pub nvvm_opts : bool ,
74
- /// The virtual compute architecture to target for PTX generation. This
75
- /// dictates how certain things are codegenned and may affect performance
76
- /// and/or which gpus the code can run on.
74
+ /// The virtual compute architecture to target for PTX generation. This dictates how
75
+ /// certain things are codegenned and may affect performance and/or which GPUs the
76
+ /// code can run on.
77
77
///
78
- /// You should generally try to pick an arch that will work with most
79
- /// GPUs you want your program to work with. Make sure to also
80
- /// use an appropriate compute arch if you are using recent features
81
- /// such as tensor cores (which need at least 7.x).
78
+ /// You should generally try to pick an arch that will work with most GPUs you want
79
+ /// your program to work with. Make sure to also use an appropriate compute arch if
80
+ /// you are using recent features such as tensor cores (which need at least 7.x).
82
81
///
83
- /// If you are unsure, either leave this option to default, or pick something around 5.2 to 7.x.
82
+ /// If you are unsure, either leave this option at its default, or pick something around
83
+ /// 5.2 to 7.x.
84
84
///
85
- /// You can find a list of features supported on each arch and a list of GPUs for every
86
- /// arch [`here`](https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications).
85
+ /// You can find a list of features supported on each arch and a list of GPUs for
86
+ /// every arch
87
+ /// [`here`](https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications).
87
88
///
88
89
/// NOTE that this does not necessarily mean that code using a certain capability
89
- /// will not work on older capabilities. It means that if it uses certain
90
- /// features it may not work.
90
+ /// will not work on older capabilities. It means that if it uses certain features
91
+ /// it may not work.
92
+ ///
93
+ /// This currently defaults to `6.1`, which corresponds to Pascal GPUs such as the
94
+ /// GT 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because Maxwell
95
+ /// (5.x) will be deprecated in CUDA 12 and we are anticipating that. Moreover, `6.x`
96
+ /// contains support for things like f64 atomic add and half precision float ops.
91
97
///
92
- /// This currently defaults to `6.1`. Which corresponds to Pascal, GPUs such as
93
- /// the GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because
94
- /// Maxwell (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover,
95
- /// `6.x` contains support for things like f64 atomic add and half precision float ops.
98
+ /// Starting with CUDA 12.9, architectures can have suffixes:
96
99
///
97
- /// ## Target Features for Conditional Compilation
100
+ /// - **No suffix** (e.g., `Compute70`): Forward-compatible across all future GPUs.
101
+ /// Best for general compatibility.
102
+ /// - **'f' suffix** (e.g., `Compute100f`): Family-specific features,
103
+ /// forward-compatible within the same major version (10.0, 10.3, etc.) but NOT across
104
+ /// major versions.
105
+ /// - **'a' suffix** (e.g., `Compute100a`): Architecture-specific features (mainly
106
+ /// Tensor Cores). Code ONLY runs on that exact compute capability, with no
107
+ /// compatibility with any other GPU.
98
108
///
99
- /// The chosen architecture enables a target feature that can be used for
100
- /// conditional compilation with `#[cfg(target_feature = "compute_XX")]`.
101
- /// This feature means "at least this capability", matching NVIDIA's semantics.
109
+ /// Most applications should use base architectures (no suffix). Only use 'f' or 'a'
110
+ /// if you need specific features and understand the compatibility trade-offs.
102
111
///
103
- /// For other patterns (exact ranges, maximum capabilities), use boolean `cfg` logic.
104
- /// See the compute capabilities guide for examples.
112
+ /// The chosen architecture enables target features for conditional compilation:
113
+ /// - Base arch: `#[cfg(target_feature = "compute_70")]` - enabled on 7.0+
114
+ /// - Family variant: `#[cfg(target_feature = "compute_100f")]` - enabled on 10.x family
115
+ /// with same or higher minor version
116
+ /// - Arch variant: `#[cfg(target_feature = "compute_100a")]` - enabled when building for
117
+ /// exactly 10.0 (includes all base and family features during compilation)
105
118
///
106
119
/// For example, with `.arch(NvvmArch::Compute61)`:
107
120
/// ```ignore
108
121
/// #[cfg(target_feature = "compute_61")]
109
122
/// {
110
- /// // Code that requires compute capability 6.1+
123
+ /// // Code that requires compute capability 6.1+ will be emitted because it matches
124
+ /// // the target architecture.
125
+ /// }
126
+ /// #[cfg(target_feature = "compute_52")]
127
+ /// {
128
+ /// // Code that requires compute capability 5.2 will be emitted
129
+ /// // because 6.1 is a superset of 5.2.
130
+ /// }
131
+ /// #[cfg(target_feature = "compute_70")]
132
+ /// {
133
+ /// // Code that requires compute capability 7.0 will NOT be emitted
134
+ /// // because the chosen arch (6.1) is not a superset of 7.0.
111
135
/// }
112
136
/// ```
137
+ ///
138
+ /// See:
139
+ /// <https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/>
113
140
pub arch : NvvmArch ,
114
141
/// Flush denormal values to zero when performing single-precision floating point operations.
115
142
/// `false` by default.
@@ -234,9 +261,7 @@ impl CudaBuilder {
234
261
/// and/or which GPUs the code can run on.
235
262
///
236
263
/// You should generally try to pick an arch that will work with most
237
- /// GPUs you want your program to work with. Make sure to also
238
- /// use an appropriate compute arch if you are using recent features
239
- /// such as tensor cores (which need at least 7.x).
264
+ /// GPUs you want your program to work with.
240
265
///
241
266
/// If you are unsure, either leave this option at its default, or pick something around 5.2 to 7.x.
242
267
///
@@ -247,8 +272,6 @@ impl CudaBuilder {
247
272
/// will not work on older capabilities. It means that if it uses certain
248
273
/// features it may not work.
249
274
///
250
- /// ## Target Features for Conditional Compilation
251
- ///
252
275
/// The chosen architecture enables target features for conditional compilation.
253
276
/// See the documentation on the `arch` field for more details.
254
277
pub fn arch ( mut self , arch : NvvmArch ) -> Self {
0 commit comments