@@ -549,6 +549,38 @@ def fn(x: torch.Tensor, begin: torch.Tensor, end: torch.Tensor, *, _launcher=_default_launcher):
     # src[test_loops.py:N]: return out
     return out
 
+--- assertExpectedJournal(TestLoops.test_flattened_tile_with_unit_axis)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_silu_kernel(x, out, _BLOCK_SIZE_0_1: tl.constexpr):
+    # src[test_loops.py:N]: for tile in hl.tile(out.size()):
+    offsets_0_1 = tl.program_id(0) * _BLOCK_SIZE_0_1 + tl.arange(0, _BLOCK_SIZE_0_1).to(tl.int32)
+    indices_1 = offsets_0_1
+    mask_0_1 = offsets_0_1 < 100
+    # src[test_loops.py:N]: out[tile] = x[tile] * torch.sigmoid(x[tile])
+    load = tl.load(x + indices_1[None, :] * 1, mask_0_1[None, :], other=0, eviction_policy='evict_first')
+    load_1 = tl.load(x + indices_1[None, :] * 1, mask_0_1[None, :], other=0)
+    v_0 = tl.cast(tl.sigmoid(tl.cast(load_1, tl.float32)), tl.float16)
+    v_1 = load * v_0
+    tl.store(out + indices_1[None, :] * 1, v_1, mask_0_1[None, :])
+
+def silu_kernel(x: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_loops.py:N]: out = torch.empty_like(x, dtype=x.dtype, device=x.device)
+    out = torch.empty_like(x, dtype=x.dtype, device=x.device)
+    # src[test_loops.py:N]: for tile in hl.tile(out.size()):
+    _BLOCK_SIZE_0_1 = 128
+    # src[test_loops.py:N]: for tile in hl.tile(out.size()):
+    # src[test_loops.py:N]: out[tile] = x[tile] * torch.sigmoid(x[tile])
+    _launcher(_helion_silu_kernel, (triton.cdiv(100, _BLOCK_SIZE_0_1), 1, 1), x, out, _BLOCK_SIZE_0_1, num_warps=32, num_stages=8)
+    # src[test_loops.py:N]: return out
+    return out
+
 --- assertExpectedJournal(TestLoops.test_full_with_dynamic_fill_value)
 from __future__ import annotations
 