diff --git a/helion/_compiler/tile_strategy.py b/helion/_compiler/tile_strategy.py
index 73406f4e7..43430dc2c 100644
--- a/helion/_compiler/tile_strategy.py
+++ b/helion/_compiler/tile_strategy.py
@@ -136,23 +136,21 @@ def get_tl_range_kwargs(config: Config, block_idx: int) -> list[str]:
     range_unroll_factor = env.config_spec.range_unroll_factors.config_get(
         config.range_unroll_factors, block_idx, 0
     )
-    if range_unroll_factor > 0:
-        kwargs.append(f"loop_unroll_factor={range_unroll_factor}")
-
     range_warp_specialize = env.config_spec.range_warp_specialize.config_get(
         config.range_warp_specializes, block_idx, None
     )
-    if range_warp_specialize is not None:
-        kwargs.append(f"warp_specialize={range_warp_specialize}")
-
     range_num_stages = env.config_spec.range_num_stages.config_get(
         config.range_num_stages, block_idx, 0
    )
+    num_stages = config.num_stages
 
-    if config.indexing == "tensor_descriptor" and range_num_stages > 0:
-        # Tensor descriptor + multi-stage tl.range pipelines tend to cause
+    if config.indexing == "tensor_descriptor":
+        # Tensor descriptor + multi-stage pipelines or loop unrolling tend to cause
         # CUDA "misaligned address" or "unspecified launch failure" errors.
-        range_num_stages = 0
+        if range_num_stages > 0:
+            range_num_stages = 0
+        if range_unroll_factor > 0 and num_stages > 1:
+            range_unroll_factor = 0
     elif (
         range_num_stages > 1
         and range_unroll_factor > 1
@@ -170,6 +168,10 @@ def get_tl_range_kwargs(config: Config, block_idx: int) -> list[str]:
             max(1, int(math.ceil(remainder / step))), range_num_stages
         )
 
+    if range_unroll_factor > 0:
+        kwargs.append(f"loop_unroll_factor={range_unroll_factor}")
+    if range_warp_specialize is not None:
+        kwargs.append(f"warp_specialize={range_warp_specialize}")
     if range_num_stages > 0:
         kwargs.append(f"num_stages={range_num_stages}")
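
For context, a minimal self-contained sketch of the control flow this diff produces (not Helion's actual implementation): plain arguments stand in for the `Config`/`CompileEnvironment` lookups, and the `elif` branch that rescales `range_num_stages` is omitted. The point is that the `tensor_descriptor` overrides now run before any `tl.range` kwargs are appended.

```python
from __future__ import annotations


def build_tl_range_kwargs(
    indexing: str,
    num_stages: int,
    range_unroll_factor: int,
    range_warp_specialize: bool | None,
    range_num_stages: int,
) -> list[str]:
    """Simplified stand-in for get_tl_range_kwargs after this patch."""
    kwargs: list[str] = []

    if indexing == "tensor_descriptor":
        # Drop per-range pipelining, and also drop loop unrolling when the
        # kernel itself is multi-stage, before any kwargs are emitted.
        if range_num_stages > 0:
            range_num_stages = 0
        if range_unroll_factor > 0 and num_stages > 1:
            range_unroll_factor = 0

    # kwargs are appended only after the overrides above have been applied.
    if range_unroll_factor > 0:
        kwargs.append(f"loop_unroll_factor={range_unroll_factor}")
    if range_warp_specialize is not None:
        kwargs.append(f"warp_specialize={range_warp_specialize}")
    if range_num_stages > 0:
        kwargs.append(f"num_stages={range_num_stages}")
    return kwargs


# With tensor descriptor indexing and a multi-stage kernel, both the unroll
# factor and the per-range num_stages are suppressed.
print(build_tl_range_kwargs("tensor_descriptor", 3, 4, None, 2))  # []
print(build_tl_range_kwargs("pointer", 3, 4, True, 2))
# ['loop_unroll_factor=4', 'warp_specialize=True', 'num_stages=2']
```

In the pre-patch code the first two kwargs were appended as soon as the factors were read, so a later decision to disable unrolling could not take effect; moving all appends after the override block keeps the emitted `tl.range` arguments consistent with the final values.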