apache · tqchen · Feb 7, 2026 · Feb 7, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -298,6 +298,7 @@ tvm_file_glob(GLOB_RECURSE COMPILER_SRCS
     src/te/*.cc
     src/autotvm/*.cc
     src/tir/*.cc
+    src/s_tir/*.cc
     src/topi/*.cc
     src/driver/*.cc
     src/support/*.cc

diff --git a/include/tvm/s_tir/transform.h b/include/tvm/s_tir/transform.h
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/s_tir/transform.h
+ * \brief S-TIR specific transformation passes.
+ */
+#ifndef TVM_S_TIR_TRANSFORM_H_
+#define TVM_S_TIR_TRANSFORM_H_
+
+#include <tvm/ir/transform.h>
+#include <tvm/target/target.h>
+#include <tvm/tir/transform.h>
+
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace s_tir {
+namespace transform {
+
+using tir::transform::CreatePrimFuncPass;
+using tvm::transform::Pass;
+using tvm::transform::PassContext;
+
+/*!
+ * \brief Canonicalize loop to start from zero .
+ * \return The pass.
+ */
+TVM_DLL Pass CanonicalizeLoop();
+
+/*!
+ * \brief Lower cross-thread reduction from thread
+ * bindings to intrinsic function calls.
+ * \return The pass.
+ */
+TVM_DLL Pass LowerCrossThreadReduction();
+
+/*!
+ * \brief Lower block init stmt into IfThenElse stmts
+ * \return The pass.
+ */
+TVM_DLL Pass LowerInitBlock();
+
+/*!
+ * \brief Locate the buffer allocation to the exact position (usually is
+ *        the lca of buffer access). This pass will inject opaque block
+ *        with alloc_buffers at the allocation site.
+ * \return The pass.
+ */
+TVM_DLL Pass PlanAndUpdateBufferAllocationLocation();
+
+/*!
+ * \brief Substitute all the block vars with the PrimExprs they are bound to, indicated by the
+ *        corresponding iter_values in BlockRealize, for opaque blocks by removing all
+ *.        the iter_values in BlockRealize and iter_vars in Block.
+ * \return The pass.
+ */
+TVM_DLL Pass ConvertBlocksToOpaque();
+
+/*!
+ * \brief Lift the same thread bindings to their LCA loops
+ * \return The pass.
+ */
+TVM_DLL Pass LiftThreadBinding();
+
+/*!
+ * \brief Compact the buffer access region by removing the buffer regions that are not accessed,
+ *        i.e. narrowing the buffer shape and adjust the access region if necessary.
+ *
+ * Before narrowing, `B` is a `[16, 16]` buffer, but only a skinny vector `B[i, 0:16]` is accessed.
+ *
+ *  \code
+ *
+ *  for i in range(0, 16):
+ *      with T.sblock():
+ *          B = T.alloc_buffer(16, 16)
+ *          for j in range(0, 16):
+ *              B[i, j] = A[i, j] + 1
+ *          for j in range(0, 16):
+ *              C[i, j] = B[i, j] + 1
+ *
+ *  \endcode
+ *
+ * This pass narrows the buffer shape and adjust its accessed region accordingly.
+ * In this particular case, because only a `1 * 16` vector of `B` is accessed,
+ * the pass narrows `B` to shape `[1, 16]`, and changes the access to `B[i, j]` to `B[0, j]`.
+ *
+ *  \code
+ *
+ *  for i in range(0, 16):
+ *      with T.sblock():
+ *          B = T.alloc_buffer(1, 16)
+ *          for j in range(0, 16):
+ *              B[0, j] = A[i, j] + 1
+ *          for j in range(0, 16):
+ *              C[i, j] = B[0, j] + 1
+ *
+ *  \endcode
+ *
+ * \param is_strict ensure the compacted shape always smaller than the original shape.
+ *   otherwise it allows to grow the shape to match actual accessed buffer regions.
+ * \return The pass.
+ */
+TVM_DLL Pass CompactBufferAllocation(bool is_strict = true);
+
+/*!
+ * \brief Remove match buffers inside the block. Also, it will validate the binding.
+ * \return The pass.
+ */
+TVM_DLL Pass LowerMatchBuffer();
+
+/*!
+ * \brief Inject permuted layout for shared memory.
+ * \return The pass.
+ */
+TVM_DLL Pass InjectPermutedLayout();
+
+/*!
+ * \brief Transform Mma scope (m16n8k8.matrixA/B/C) to local scope with layout transformation.
+ * \return The pass.
+ */
+TVM_DLL Pass TransformMmaBufferLayout();
+
+/*!
+ * \brief Remove the block to ensure that the TIR can not be scheduled again.
+ * \return The pass.
+ */
+TVM_DLL Pass LowerOpaqueBlock();
+
+/*!
+ * \brief Unify all the thread bindings for "blockIdx.x/y/z", "threadIdx.x/y/z", and
+ *        "vthread.x/y/z". Before the unification, two vars that are bound to a thread axis (e.g.,
+ *        "threadIdx.x") use different IterVars and variables in their AttrStmts. After the
+ *        unification, we use a consolidated IterVar and a variable for them.
+ * \return The pass.
+ * \note `vthread` is a legacy behavior that will be deprecated, though thread bindings of `vthread`
+ *       are still also unified in this pass. Please use `vthread.x`, `vthread.y` and `vthread.z`
+ *       instead.
+ */
+TVM_DLL Pass UnifyThreadBinding();
+
+/*!
+ * \brief This pass transforms annotated loops into pipelined ones where producers and consumers
+ * are overlapped with the information provided in loop annotations, which enables optimization
+ * techniques like prefetching and pipeline parallelism.
+ *
+ * The pipeline scope consists of the direct children of the annotated loop (ignoring SBlockRealize,
+ * SBlock, SeqStmt), and the number of children is denoted by `n` in the documentation.
+ *
+ * The following annotations are used to guide the loop transformation:
+ *
+ * 1) Loop annotation `software_pipeline_stage` defines the pipeline stage.
+ * An array of `n` integers, and each element should be in range [0, max_stage],
+ * where max_stage is the maximum (inclusive) stage.
+ * 2) Loop annotation `software_pipeline_order` defines the pipeline order.
+ * An array of `n` integers, a permutation of [0, 1, ..., num_components - 1];
+ * 3) SBlock annotation `double_buffer_scope` controls certain buffer sizes to allow decoupling of
+ * read/write dependency. It's an integer index of the write regions of the block.
+ *
+ * Every annotated loop is transformed into a loop with three blocks as its direct children:
+ *
+ * 1) Prologue block, where components whose stage is less than `max_stage` is executed;
+ *
+ * 2) Body block, where all the components are executed;
+ *
+ * 3) Epilogue block, where only components whose stage is greater than 0 will be executed.
+ * The execution order is controlled by the annotation `software_pipeline_order`,
+ * and thus could be different than the original order.
+ *
+ * Note: For nested software pipelines, the inner software pipeline will be generated first,
+ * which may affect the number of the direct children of the outer loop.
+ * In this case, the annotations for the outer software
+ * pipeline should include the result of the inner software pipeline,
+ * which is the three blocks as discussed above.
+ *
+ * \return The IR transform pass.
+ */
+TVM_DLL Pass InjectSoftwarePipeline();
+
+/*!
+ * \brief Automatically do memory optimizations for auto copy blocks
+ * \return The pass.
+ */
+TVM_DLL Pass LowerAutoCopy();
+
+/*!
+ * \brief Add the explicit local stage for the shared memory access on GPU.
+ * \return The pass.
+ */
+TVM_DLL Pass ManifestSharedMemoryLocalStage();
+
+/*! \brief Annotate irregular loop mark. */
+TVM_DLL Pass AnnotateIrregularLoop();
+
+/*!
+ * \brief partition loops in the stmt.
+ *
+ * \return The pass.
+ */
+TVM_DLL Pass LoopPartition();
+
+/*!
+ * \brief Inject virtual thread loops.
+ *
+ * \return The pass.
+ */
+TVM_DLL Pass InjectVirtualThread();
+
+/*!
+ * \brief Inject double buffer statements.
+ *
+ * \return The pass.
+ */
+TVM_DLL Pass InjectDoubleBuffer();
+
+}  // namespace transform
+}  // namespace s_tir
+}  // namespace tvm
+
+#endif  // TVM_S_TIR_TRANSFORM_H_