diff --git a/llama-tornado b/llama-tornado
index e09070b9..7791995c 100755
--- a/llama-tornado
+++ b/llama-tornado
@@ -73,6 +73,9 @@ class LlamaRunner:
         if args.use_gpu:
             cmd.append("-Duse.tornadovm=true")
 
+        if args.verbose_init:
+            cmd.append("-Dllama.EnableTimingForTornadoVMInit=true")
+
         # Debug options
         debug_config = []
 
@@ -289,6 +292,9 @@ def create_parser() -> argparse.ArgumentParser:
                                help="Print kernel information (tornado.printKernel=true)")
     verbose_group.add_argument("--full-dump", dest="full_dump", action="store_true",
                                help="Enable full debug dump (tornado.fullDebug=true)")
+    verbose_group.add_argument("--verbose-init", dest="verbose_init", action="store_true",
+                               help="Enable timers for TornadoVM initialization (llama.EnableTimingForTornadoVMInit=true)")
+
     # Command display options
     command_group = parser.add_argument_group("Command Display Options")
 
diff --git a/src/main/java/com/example/loader/weights/ModelLoader.java b/src/main/java/com/example/loader/weights/ModelLoader.java
index 45a55994..4089b43a 100644
--- a/src/main/java/com/example/loader/weights/ModelLoader.java
+++ b/src/main/java/com/example/loader/weights/ModelLoader.java
@@ -89,7 +89,7 @@ public static Weights loadWeights(Map<String, GGMLTensorEntry> tensorEntries, Co
         GGMLTensorEntry outputWeight = tensorEntries.getOrDefault("output.weight", tokenEmbeddings);
 
         if (LlamaApp.USE_TORNADOVM) {
-            System.out.println("Loading weights in TornadoVM format");
+            System.out.println("Loading model weights in TornadoVM format (converting " + outputWeight.ggmlType() + " -> " + GGMLType.F16 + ")");
             return createTornadoVMWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
         } else {
             return createStandardWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
diff --git a/src/main/java/com/example/tornadovm/TornadoVMLayerPlanner.java b/src/main/java/com/example/tornadovm/TornadoVMLayerPlanner.java
index 0f2bd8eb..e072fbec 100644
--- a/src/main/java/com/example/tornadovm/TornadoVMLayerPlanner.java
+++ b/src/main/java/com/example/tornadovm/TornadoVMLayerPlanner.java
@@ -17,342 +17,341 @@ import java.util.List;
 
 // @formatter:off
-/**
- * TornadoVMLayerPlanner orchestrates the execution planning for transformer model inference
- * on GPU using the TornadoVM framework.
- *
- * This class is responsible for:
- * - Creating task graphs for each layer of the neural network
- * - Managing GPU memory transfers between layers
- * - Configuring worker grids for optimal GPU utilization
- * - Setting up the execution schedule for the entire forward pass
- *
- * The planner implements a layered approach where:
- * - Each layer is represented as a separate TaskGraph
- * - Data transfers are optimized to minimize host-device communication
- * - Worker grids are configured for different types of operations (attention, FFN, etc.)
- * - The entire pipeline is scheduled to run efficiently on GPU
- *
- * Key optimizations include:
- * - One-time transfer of static data (weights, caches)
- * - Per-execution transfer of dynamic data (position, activations)
- * - Device-to-device data consumption between layers
- * - Parallelized attention computation across heads
- *
- * @see TaskGraph
- * @see GridScheduler
- * @see Llama
- */
-// @formatter:on
-public class TornadoVMLayerPlanner {
-    private static final int LOCAL_WORK_GROUP_SIZE_ALLOC = 32;
-
-    private final State state;
-    private final Configuration config;
-    private final Weights weights;
-    private final KernelContext context;
-
-    /**
-     * Constructs a TornadoVMLayerPlanner for the given Llama model.
-     *
-     * @param state
-     *            The state object containing model tensors and buffers
-     * @param model
-     *            The Llama model instance containing configuration and weights
-     */
-    public TornadoVMLayerPlanner(State state, Llama model) {
-        this.state = state;
-        this.config = model.configuration();
-        this.weights = model.weights();
-        this.context = new KernelContext();
-    }
-
-    public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLayered() {
-        List<ImmutableTaskGraph> taskGraphs = new ArrayList<>();
-
-        state.temp.init(0.0f);
-        state.tempFFN.init(0.0f);
-        state.tempLogits.init(0.0f);
-
-        // @formatter:off
-        TaskGraph activationUpdate = new TaskGraph("activationUpdate")
-                .transferToDevice(DataTransferMode.EVERY_EXECUTION, state.wrapX)
-                .task("updateX", TransformerComputeKernels::emptyTaskToForceCopyIn, state.wrapX)
-                .persistOnDevice(state.wrapX);
-        taskGraphs.add(activationUpdate.snapshot());
-
-        TaskGraph unifiedLayer = null;
-        for (int layerIndex =0; layerIndex < config.numberOfLayers; layerIndex++) {
-            unifiedLayer = new TaskGraph("layer_" + layerIndex);
-            unifiedLayer.consumeFromDevice(state.wrapX);
-            unifiedLayer.transferToDevice(DataTransferMode.FIRST_EXECUTION,
-                    //Copy-in weights per layer for batched-layered layout
-                    weights.rms_att_weightLayered[layerIndex],
-                    weights.wqLayered[layerIndex],
-                    weights.wkLayered[layerIndex],
-                    weights.wvLayered[layerIndex],
-                    weights.woLayered[layerIndex],
-                    weights.rms_ffn_weightLayered[layerIndex],
-                    weights.w1Layered[layerIndex],
-                    weights.w2Layered[layerIndex],
-                    weights.w3Layered[layerIndex]
-            );
-            unifiedLayer = configureLayerDataTransfers(unifiedLayer, layerIndex);
-            unifiedLayer.task("reductionsOneBlock" , TransformerComputeKernelsLayered::reductionOneBlockWithLayer, context, state.temp,
-                            state.wrapX, config.dim, config.rmsNormEps, state.localSize)
-                    .task("mapContext", TransformerComputeKernelsLayered::reductionOneBlock2WithLayer, context, state.wrapXb,
-                            state.wrapX, weights.rms_att_weightLayered[layerIndex], state.temp)
-                    .task("qmatmul", TransformerComputeKernelsLayered::matrixVectorGeneric, context,
-                            state.wrapXb, state.wrapQ, weights.wqLayered[layerIndex], config.dim, config.dim, LOCAL_WORK_GROUP_SIZE_ALLOC)
-                    .task("kmatmul", TransformerComputeKernelsLayered::matrixVectorGeneric, context,
-                            state.wrapXb, state.wrapK, weights.wkLayered[layerIndex], config.dim, config.kvDim, LOCAL_WORK_GROUP_SIZE_ALLOC)
-                    .task("vmatmul", TransformerComputeKernelsLayered::matrixVectorGeneric, context,
-                            state.wrapXb, state.wrapV, weights.wvLayered[layerIndex], config.dim, config.kvDim, LOCAL_WORK_GROUP_SIZE_ALLOC)
-                    .task("rope", TransformerComputeKernelsLayered::ropeRotation,context,
-                            state.positionHolder, state.wrapQ, state.wrapK, config.kvDim,
-                            config.headSize)
-                    .task("copyToCaches", TransformerComputeKernelsLayered::copyToCache,
-                            state.wrapKeyCache, state.wrapK, state.wrapValueCache, state.wrapV, state.positionHolder, config.kvDim, layerIndex, config.contextLength)
-                    .task("parallel-attention", TransformerComputeKernelsLayered::processHeadsFlashAttention, context,
-                            state.wrapQ, state.wrapKeyCache, state.wrapValueCache, state.wrapXb,
-                            config.numberOfHeads, config.headSize, config.kvDim, config.kvMul,
-                            state.positionHolder, layerIndex, config.contextLength)
-                    .task("matmul1", TransformerComputeKernelsLayered::matrixVectorGenericWithResidual, context,
-                            state.wrapXb, state.wrapX, weights.woLayered[layerIndex], config.dim, config.dim, LOCAL_WORK_GROUP_SIZE_ALLOC)
-                    .task("reductionsOneBlockFFN", TransformerComputeKernelsLayered::reductionOneBlockWithLayer, context, state.tempFFN,
-                            state.wrapX, config.dim, config.rmsNormEps, state.localSize)
-                    .task("mapContextFFN", TransformerComputeKernelsLayered::reductionOneBlock2WithLayer, context, state.wrapXb,
-                            state.wrapX, weights.rms_ffn_weightLayered[layerIndex], state.tempFFN)
-                    .task("fused_ffn_w1_w3", TransformerComputeKernelsLayered::fusedFeedForwardWithSiLUAndGLUActivation, context,
-                            state.wrapXb, state.wrapHb, weights.w1Layered[layerIndex], weights.w3Layered[layerIndex], config.dim, config.hiddenDim, LOCAL_WORK_GROUP_SIZE_ALLOC)
-                    .task("projectionTwo", TransformerComputeKernelsLayered::matrixVectorGenericWithResidual, context,
-                            state.wrapHb, state.wrapX, weights.w2Layered[layerIndex], config.hiddenDim, config.dim, LOCAL_WORK_GROUP_SIZE_ALLOC)
-                    .persistOnDevice(
-                            state.wrapX
-                    );
-            taskGraphs.add(unifiedLayer.snapshot());
-        }
-
-        TaskGraph lastUnifiedLayer = unifiedLayer;
-        TaskGraph logits = new TaskGraph("logits")
-                .consumeFromDevice(lastUnifiedLayer.getTaskGraphName(),
-                        state.wrapX
-                )
-                .transferToDevice(DataTransferMode.EVERY_EXECUTION,
-                        state.tempLogits
-                )
-                .transferToDevice(DataTransferMode.FIRST_EXECUTION,
-                        context,
-                        state.wrapLogits,
-                        weights.wclsHalfFloat,
-                        weights.rms_final_weight_as_floatArray
-                )
-                .task("reductionsOneBlockLogits", TransformerComputeKernels::reductionOneBlockWithLayer, context, state.tempLogits,
-                        state.wrapX, config.dim, config.rmsNormEps, state.localSize)
-                .task("mapContextLogits", TransformerComputeKernels::reductionOneBlock2WithLogits, context, state.wrapX,
-                        weights.rms_final_weight_as_floatArray, state.tempLogits);
-        logits = configureQuantizedMatrixVectorFinalWeight(logits);
-        logits.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapLogits);
-        taskGraphs.add(logits.snapshot());
-        // @formatter:on
-
-        return new Tuple2<>(taskGraphs, setupGridSchedulersLayered());
-    }
-
-    // @formatter:off
-    /**
-     * Configures the final projection layer in the task graph based on weight quantization type.
-     *
-     * This method adds a "projection" task to compute the final logits by performing a
-     * matrix-vector multiplication between the model's output embeddings and the classifier
-     * weights (wcls). The computation kernel used depends on the quantization format.
-     *
-     * Supported quantization types:
-     * - Q8_0: 8-bit quantization with uniform scaling per 32-element block
-     * - Q4_0: 4-bit quantization with uniform scaling per 32-element block
-     *
-     * The task multiplies:
-     * - weights.wclsByteArray: Quantized classifier weights (vocab_size x dim)
-     * - state.wrapX: Current layer output (dim)
-     * - Result: state.wrapLogits: Raw logits (vocab_size)
-     *
-     * @param logits The existing task graph to extend with the projection operation
-     * @return The modified task graph with the projection task added
-     * @throws UnsupportedOperationException If weights.weightType is not Q8_0 or Q4_0
-     */
-    // @formatter:on
-    private TaskGraph configureQuantizedMatrixVectorFinalWeight(TaskGraph logits) {
-        switch (weights.weightType) {
-            case Q8_0:
-                logits.task("projection", TransformerComputeKernelsLayered::matrixVectorGeneric, //
-                        context,
-                        state.wrapX, state.wrapLogits, weights.wclsHalfFloat, //
-                        config.dim, config.vocabularySize, LOCAL_WORK_GROUP_SIZE_ALLOC * 2); //
-                break;
-            case Q4_0:
-                logits.task("projection", TransformerComputeKernelsLayered::matrixVectorGeneric, //
-                        context,
-                        state.wrapX, state.wrapLogits, weights.wclsHalfFloat, //
-                        config.dim, config.vocabularySize, LOCAL_WORK_GROUP_SIZE_ALLOC * 2); //
-                break;
-            default:
-                throw new UnsupportedOperationException("Unsupported weight quantization type: " + weights.weightType + ". Only Q8_0 and Q4_0 are supported.");
-        }
-        return logits;
-    }
-
-    /**
-     * Configures data transfer operations for a specific layer in the neural network task graph.
-     *
-     * This method manages GPU memory transfers with optimized data movement strategies:
-     * This optimization pattern minimizes data movement by:
-     * 1. Using one-time transfers for static data
-     * 2. Reusing intermediate results already on GPU from previous layers
-     * 3. Only transferring dynamic data that changes per execution
-     *
-     * @param unifiedLayer
-     *            The task graph representing this layer's operations
-     * @param layerIndex
-     *            Index of the current layer (0-based)
-     * @return The configured task graph with appropriate data transfer operations
-     */
-    private TaskGraph configureLayerDataTransfers(TaskGraph unifiedLayer, int layerIndex) {
-        // First layer: Transfer initial data to device (one-time transfer)
-        if (layerIndex == 0) {
-            // Transfer all attention-related data: query, key, value matrices and their caches
-            unifiedLayer.transferToDevice(DataTransferMode.EVERY_EXECUTION, state.positionHolder, state.temp, state.tempFFN); //
-            unifiedLayer.transferToDevice(DataTransferMode.FIRST_EXECUTION, //
-                    context, state.wrapXb, state.wrapXb2, //
-                    state.wrapQ, state.wrapK, state.wrapV, //
-                    state.wrapKeyCache, state.wrapValueCache, //
-                    state.wrapAtt, state.wrapHb); //
-        } else {
-            // Subsequent layers: Consume data already on device from previous layer
-            unifiedLayer.consumeFromDevice(context, state.wrapXb, state.wrapXb2, //
-                    state.wrapQ, state.wrapK, state.wrapV, //
-                    state.wrapKeyCache, state.wrapValueCache, //
-                    state.wrapAtt, state.wrapHb, //
-                    state.positionHolder //
-            );
-        }
-        return unifiedLayer;
-    }
-
-    // @formatter:off
-    /**
-     * Sets up the grid scheduler configuration for a layered neural network forward pass.
-     *
-     * This method creates and configures worker grids for different types of GPU operations
-     * in the transformer/ML model pipeline. Each worker grid defines how work should be
-     * distributed across GPU threads (OpenCL work-items or CUDA threads).
-     *
-     * The method creates several worker profiles:
-     * - Single thread operations (activation updates)
-     * - RoPE (Rotary Position Embedding) operations
-     * - Matrix multiplications with different dimensions
-     * - RMS normalization operations
-     * - Parallel attention computations
-     * - Cache copying operations
-     * - Vocabulary projections
-     *
-     * Each worker grid maps to equivalent OpenCL NDRange or CUDA grid/block configurations:
-     * - setGlobalWork() ≈ OpenCL global_work_size ≈ CUDA grid dimensions × block dimensions
-     * - setLocalWork() ≈ OpenCL local_work_size ≈ CUDA block dimensions
-     *
-     * @return GridScheduler configured with all necessary worker grids for the model layers
-     */
-    // @formatter:on
-    private GridScheduler setupGridSchedulersLayered() {
-        GridScheduler tornadoForwardScheduler = new GridScheduler();
-
-        // Single worker for tasks running with a single thread
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[1,1,1], localWorkSize=[1,1,1])
-        // CUDA equivalent: kernel<<<1, 1>>>
-        WorkerGrid singleWorker = new WorkerGrid1D(1);
-        singleWorker.setGlobalWork(1, 1, 1);
-        singleWorker.setLocalWork(1, 1, 1);
-
-        // config.dim / 2 Worker for RoPE
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim/2,1,1], localWorkSize=[128,1,1])
-        // CUDA equivalent: kernel<<<(config.dim/2)/128, 128>>>
-        WorkerGrid ropeWorker = new WorkerGrid1D(config.dim / 2);
-        ropeWorker.setGlobalWork(config.dim / 2, 1, 1);
-        ropeWorker.setLocalWork(128, 1, 1);
-
-        // config.dim Worker for Row major access
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim*LOCAL_WORK_GROUP_SIZE_ALLOC,1,1], localWorkSize=[LOCAL_WORK_GROUP_SIZE_ALLOC,1,1])
-        // CUDA equivalent: kernel<<<config.dim, LOCAL_WORK_GROUP_SIZE_ALLOC>>>
-        int configDimRowMajorGlobal = config.dim * LOCAL_WORK_GROUP_SIZE_ALLOC;
-        WorkerGrid configDimRowMajorGlobalWorker = new WorkerGrid1D(configDimRowMajorGlobal);
-        configDimRowMajorGlobalWorker.setLocalWork(LOCAL_WORK_GROUP_SIZE_ALLOC, 1, 1);
-
-        // config.kvDim Worker for Row major access
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.kvDim*LOCAL_WORK_GROUP_SIZE_ALLOC,1,1], localWorkSize=[LOCAL_WORK_GROUP_SIZE_ALLOC,1,1])
-        // CUDA equivalent: kernel<<<config.kvDim, LOCAL_WORK_GROUP_SIZE_ALLOC>>>
-        int configKvDimRowMajorGlobal = config.kvDim * LOCAL_WORK_GROUP_SIZE_ALLOC;
-        WorkerGrid configKvDimRowMajorGlobalWorker = new WorkerGrid1D(configKvDimRowMajorGlobal);
-        configKvDimRowMajorGlobalWorker.setLocalWork(LOCAL_WORK_GROUP_SIZE_ALLOC, 1, 1);
-
-        // config.hiddenDim * 32 Worker for Row major access
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.hiddenDim*LOCAL_WORK_GROUP_SIZE_ALLOC,1,1], localWorkSize=[LOCAL_WORK_GROUP_SIZE_ALLOC,1,1])
-        // CUDA equivalent: kernel<<<config.hiddenDim, LOCAL_WORK_GROUP_SIZE_ALLOC>>>
-        int configHiddenDimRowMajor = config.hiddenDim * LOCAL_WORK_GROUP_SIZE_ALLOC;
-        WorkerGrid configHiddenDimRowMajorWorker = new WorkerGrid1D(configHiddenDimRowMajor);
-        configHiddenDimRowMajorWorker.setLocalWork(LOCAL_WORK_GROUP_SIZE_ALLOC, 1, 1);
-
-        // RMSNorm worker configuration
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim,1,1], localWorkSize=[256,1,1])
-        // CUDA equivalent: kernel<<<config.dim/256, 256>>>
-        WorkerGrid rmsNormWorker = new WorkerGrid1D(config.dim);
-        rmsNormWorker.setGlobalWork(config.dim, 1, 1);    // Set global work size to total dimension
-        rmsNormWorker.setLocalWork(256, 1, 1);            // Set local work size to 256 (standard efficient size)
-
-        // Parallel attention worker configuration
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.numberOfHeads,1,1], localWorkSize=[4,1,1])
-        // CUDA equivalent: kernel<<<config.numberOfHeads, 4>>>
-        WorkerGrid parallelAttentionWorker = new WorkerGrid1D(config.numberOfHeads);
-        // the global group work size is numberOfHeads * localWorkGroupSize, where the localWorkGroupSize is currently 4
-        parallelAttentionWorker.setGlobalWork(config.numberOfHeads * 4, 1, 1);
-        parallelAttentionWorker.setLocalWork(4, 1, 1);    // Set local work size to 4 (for parallel attention)
-
-        // Copy to caches worker configuration
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim,1,1], localWorkSize=[128,1,1])
-        // CUDA equivalent: kernel<<<config.dim/128, 128>>>
-        WorkerGrid copyToCachesWorker = new WorkerGrid1D(config.kvDim);
-        copyToCachesWorker.setGlobalWork(config.dim, 1, 1);
-        copyToCachesWorker.setLocalWork(128, 1, 1);    // Set local work size to 32 (for copying to caches)
-
-        // Map workers to tasks
-        tornadoForwardScheduler.addWorkerGrid("activationUpdate.updateX", singleWorker);
-        for (int i = 0; i < config.numberOfLayers; i++) {
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".qmatmul", configDimRowMajorGlobalWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".kmatmul", configKvDimRowMajorGlobalWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".vmatmul", configKvDimRowMajorGlobalWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".rope", ropeWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".matmul1", configDimRowMajorGlobalWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".projectionTwo", configDimRowMajorGlobalWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".fused_ffn_w1_w3", configHiddenDimRowMajorWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".reductionsOneBlock", rmsNormWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".mapContext", rmsNormWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".reductionsOneBlockFFN", rmsNormWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".mapContextFFN", rmsNormWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".parallel-attention", parallelAttentionWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".copyToCaches", copyToCachesWorker);
-        }
-
-        // Vocabulary worker configuration
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.vocabularySize,1,1], localWorkSize=[16,1,1])
-        // CUDA equivalent: kernel<<<config.vocabularySize/16, 16>>>
-        int vocabSizeRowMajor = config.vocabularySize * LOCAL_WORK_GROUP_SIZE_ALLOC * 2;
-        WorkerGrid vocabWorker = new WorkerGrid1D(vocabSizeRowMajor);
-        vocabWorker.setLocalWork(LOCAL_WORK_GROUP_SIZE_ALLOC * 2, 1, 1);
-
-        tornadoForwardScheduler.addWorkerGrid("logits.projection", vocabWorker);
-        tornadoForwardScheduler.addWorkerGrid("logits.reductionsOneBlockLogits", rmsNormWorker);
-        tornadoForwardScheduler.addWorkerGrid("logits.mapContextLogits", rmsNormWorker);
-
-        return tornadoForwardScheduler;
-    }
-
-}
+    /**
+     * TornadoVMLayerPlanner orchestrates the execution planning for transformer model inference
+     * on GPU using the TornadoVM framework.
+     *
+     * This class is responsible for:
+     * - Creating task graphs for each layer of the neural network
+     * - Managing GPU memory transfers between layers
+     * - Configuring worker grids for optimal GPU utilization
+     * - Setting up the execution schedule for the entire forward pass
+     *
+     * The planner implements a layered approach where:
+     * - Each layer is represented as a separate TaskGraph
+     * - Data transfers are optimized to minimize host-device communication
+     * - Worker grids are configured for different types of operations (attention, FFN, etc.)
+     * - The entire pipeline is scheduled to run efficiently on GPU
+     *
+     * Key optimizations include:
+     * - One-time transfer of static data (weights, caches)
+     * - Per-execution transfer of dynamic data (position, activations)
+     * - Device-to-device data consumption between layers
+     * - Parallelized attention computation across heads
+     *
+     * @see TaskGraph
+     * @see GridScheduler
+     * @see Llama
+     */
+    // @formatter:on
+    public class TornadoVMLayerPlanner {
+        private static final int LOCAL_WORK_GROUP_SIZE_ALLOC = 32;
+        private static final int THREAD_SCALE_FOR_LOGITS = 8;
+
+        private final State state;
+        private final Configuration config;
+        private final Weights weights;
+        private final KernelContext context;
+
+        /**
+         * Constructs a TornadoVMLayerPlanner for the given Llama model.
+         *
+         * @param state
+         *            The state object containing model tensors and buffers
+         * @param model
+         *            The Llama model instance containing configuration and weights
+         */
+        public TornadoVMLayerPlanner(State state, Llama model) {
+            this.state = state;
+            this.config = model.configuration();
+            this.weights = model.weights();
+            this.context = new KernelContext();
+        }
+
+        public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLayered() {
+            List<ImmutableTaskGraph> taskGraphs = new ArrayList<>();
+
+            state.temp.init(0.0f);
+            state.tempFFN.init(0.0f);
+            state.tempLogits.init(0.0f);
+
+            // @formatter:off
+            TaskGraph activationUpdate = new TaskGraph("activationUpdate")
+                    .transferToDevice(DataTransferMode.EVERY_EXECUTION, state.wrapX)
+                    .task("updateX", TransformerComputeKernels::emptyTaskToForceCopyIn, state.wrapX)
+                    .persistOnDevice(state.wrapX);
+            taskGraphs.add(activationUpdate.snapshot());
+
+            TaskGraph unifiedLayer = null;
+            for (int layerIndex = 0; layerIndex < config.numberOfLayers; layerIndex++) {
+                unifiedLayer = new TaskGraph("layer_" + layerIndex);
+                unifiedLayer.consumeFromDevice(state.wrapX);
+                unifiedLayer.transferToDevice(DataTransferMode.FIRST_EXECUTION,
+                        // Copy-in weights per layer for batched-layered layout
+                        weights.rms_att_weightLayered[layerIndex],
+                        weights.wqLayered[layerIndex],
+                        weights.wkLayered[layerIndex],
+                        weights.wvLayered[layerIndex],
+                        weights.woLayered[layerIndex],
+                        weights.rms_ffn_weightLayered[layerIndex],
+                        weights.w1Layered[layerIndex],
+                        weights.w2Layered[layerIndex],
+                        weights.w3Layered[layerIndex]
+                );
+                unifiedLayer = configureLayerDataTransfers(unifiedLayer, layerIndex);
+                unifiedLayer.task("reductionsOneBlock", TransformerComputeKernelsLayered::reductionOneBlockWithLayer, context, state.temp,
+                                state.wrapX, config.dim, config.rmsNormEps, state.localSize)
+                        .task("mapContext", TransformerComputeKernelsLayered::reductionOneBlock2WithLayer, context, state.wrapXb,
+                                state.wrapX, weights.rms_att_weightLayered[layerIndex], state.temp)
+                        .task("qmatmul", TransformerComputeKernelsLayered::matrixVectorGeneric, context,
+                                state.wrapXb, state.wrapQ, weights.wqLayered[layerIndex], config.dim, config.dim, LOCAL_WORK_GROUP_SIZE_ALLOC)
+                        .task("kmatmul", TransformerComputeKernelsLayered::matrixVectorGeneric, context,
+                                state.wrapXb, state.wrapK, weights.wkLayered[layerIndex], config.dim, config.kvDim, LOCAL_WORK_GROUP_SIZE_ALLOC)
+                        .task("vmatmul", TransformerComputeKernelsLayered::matrixVectorGeneric, context,
+                                state.wrapXb, state.wrapV, weights.wvLayered[layerIndex], config.dim, config.kvDim, LOCAL_WORK_GROUP_SIZE_ALLOC)
+                        .task("rope", TransformerComputeKernelsLayered::ropeRotation, context,
+                                state.positionHolder, state.wrapQ, state.wrapK, config.kvDim,
+                                config.headSize)
+                        .task("copyToCaches", TransformerComputeKernelsLayered::copyToCache,
+                                state.wrapKeyCache, state.wrapK, state.wrapValueCache, state.wrapV, state.positionHolder, config.kvDim, layerIndex, config.contextLength)
+                        .task("parallel-attention", TransformerComputeKernelsLayered::processHeadsFlashAttention, context,
+                                state.wrapQ, state.wrapKeyCache, state.wrapValueCache, state.wrapXb,
+                                config.numberOfHeads, config.headSize, config.kvDim, config.kvMul,
+                                state.positionHolder, layerIndex, config.contextLength)
+                        .task("matmul1", TransformerComputeKernelsLayered::matrixVectorGenericWithResidual, context,
+                                state.wrapXb, state.wrapX, weights.woLayered[layerIndex], config.dim, config.dim, LOCAL_WORK_GROUP_SIZE_ALLOC)
+                        .task("reductionsOneBlockFFN", TransformerComputeKernelsLayered::reductionOneBlockWithLayer, context, state.tempFFN,
+                                state.wrapX, config.dim, config.rmsNormEps, state.localSize)
+                        .task("mapContextFFN", TransformerComputeKernelsLayered::reductionOneBlock2WithLayer, context, state.wrapXb,
+                                state.wrapX, weights.rms_ffn_weightLayered[layerIndex], state.tempFFN)
+                        .task("fused_ffn_w1_w3", TransformerComputeKernelsLayered::fusedFeedForwardWithSiLUAndGLUActivation, context,
+                                state.wrapXb, state.wrapHb, weights.w1Layered[layerIndex], weights.w3Layered[layerIndex], config.dim, config.hiddenDim, LOCAL_WORK_GROUP_SIZE_ALLOC)
+                        .task("projectionTwo", TransformerComputeKernelsLayered::matrixVectorGenericWithResidual, context,
+                                state.wrapHb, state.wrapX, weights.w2Layered[layerIndex], config.hiddenDim, config.dim, LOCAL_WORK_GROUP_SIZE_ALLOC)
+                        .persistOnDevice(
+                                state.wrapX
+                        );
+                taskGraphs.add(unifiedLayer.snapshot());
+            }
+
+            TaskGraph lastUnifiedLayer = unifiedLayer;
+            TaskGraph logits = new TaskGraph("logits")
+                    .consumeFromDevice(lastUnifiedLayer.getTaskGraphName(),
+                            state.wrapX
+                    )
+                    .transferToDevice(DataTransferMode.EVERY_EXECUTION,
+                            state.tempLogits
+                    )
+                    .transferToDevice(DataTransferMode.FIRST_EXECUTION,
+                            context,
+                            state.wrapLogits,
+                            weights.wclsHalfFloat,
+                            weights.rms_final_weight_as_floatArray
+                    )
+                    .task("reductionsOneBlockLogits", TransformerComputeKernels::reductionOneBlockWithLayer, context, state.tempLogits,
+                            state.wrapX, config.dim, config.rmsNormEps, state.localSize)
+                    .task("mapContextLogits", TransformerComputeKernels::reductionOneBlock2WithLogits, context, state.wrapX,
+                            weights.rms_final_weight_as_floatArray, state.tempLogits);
+            logits = configureQuantizedMatrixVectorFinalWeight(logits);
+            logits.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapLogits);
+            taskGraphs.add(logits.snapshot());
+            // @formatter:on
+
+            return new Tuple2<>(taskGraphs, setupGridSchedulersLayered());
+        }
+
+        // @formatter:off
+        /**
+         * Configures the final projection layer in the task graph based on weight quantization type.
+         *
+         * This method adds a "projection" task to compute the final logits by performing a
+         * matrix-vector multiplication between the model's output embeddings and the classifier
+         * weights (wcls). The computation kernel used depends on the quantization format.
+         *
+         * Supported quantization types:
+         * - Q8_0: 8-bit quantization with uniform scaling per 32-element block
+         * - Q4_0: 4-bit quantization with uniform scaling per 32-element block
+         *
+         * The task multiplies:
+         * - weights.wclsHalfFloat: Quantized classifier weights (vocab_size x dim)
+         * - state.wrapX: Current layer output (dim)
+         * - Result: state.wrapLogits: Raw logits (vocab_size)
+         *
+         * @param logits The existing task graph to extend with the projection operation
+         * @return The modified task graph with the projection task added
+         * @throws UnsupportedOperationException If weights.weightType is not Q8_0 or Q4_0
+         */
+        // @formatter:on
+        private TaskGraph configureQuantizedMatrixVectorFinalWeight(TaskGraph logits) {
+            switch (weights.weightType) {
+                case Q8_0:
+                    logits.task("projection", TransformerComputeKernelsLayered::matrixVectorGeneric, //
+                            context, state.wrapX, state.wrapLogits, weights.wclsHalfFloat, //
+                            config.dim, config.vocabularySize, LOCAL_WORK_GROUP_SIZE_ALLOC * THREAD_SCALE_FOR_LOGITS); //
+                    break;
+                case Q4_0:
+                    logits.task("projection", TransformerComputeKernelsLayered::matrixVectorGeneric, //
+                            context, state.wrapX, state.wrapLogits, weights.wclsHalfFloat, //
+                            config.dim, config.vocabularySize, LOCAL_WORK_GROUP_SIZE_ALLOC * THREAD_SCALE_FOR_LOGITS); //
+                    break;
+                default:
+                    throw new UnsupportedOperationException("Unsupported weight quantization type: " + weights.weightType + ". Only Q8_0 and Q4_0 are supported.");
+            }
+            return logits;
+        }
+
+        /**
+         * Configures data transfer operations for a specific layer in the neural network task graph.
+         *
+         * This method manages GPU memory transfers with an optimized data movement strategy that
+         * minimizes data movement by:
+         * 1. Using one-time transfers for static data
+         * 2. Reusing intermediate results already on GPU from previous layers
+         * 3. Only transferring dynamic data that changes per execution
+         *
+         * @param unifiedLayer
+         *            The task graph representing this layer's operations
+         * @param layerIndex
+         *            Index of the current layer (0-based)
+         * @return The configured task graph with appropriate data transfer operations
+         */
+        private TaskGraph configureLayerDataTransfers(TaskGraph unifiedLayer, int layerIndex) {
+            // First layer: Transfer initial data to device (one-time transfer)
+            if (layerIndex == 0) {
+                // Transfer all attention-related data: query, key, value matrices and their caches
+                unifiedLayer.transferToDevice(DataTransferMode.EVERY_EXECUTION, state.positionHolder, state.temp, state.tempFFN); //
+                unifiedLayer.transferToDevice(DataTransferMode.FIRST_EXECUTION, //
+                        context, state.wrapXb, state.wrapXb2, //
+                        state.wrapQ, state.wrapK, state.wrapV, //
+                        state.wrapKeyCache, state.wrapValueCache, //
+                        state.wrapAtt, state.wrapHb); //
+            } else {
+                // Subsequent layers: Consume data already on device from previous layer
+                unifiedLayer.consumeFromDevice(context, state.wrapXb, state.wrapXb2, //
+                        state.wrapQ, state.wrapK, state.wrapV, //
+                        state.wrapKeyCache, state.wrapValueCache, //
+                        state.wrapAtt, state.wrapHb, //
+                        state.positionHolder //
+                );
+            }
+            return unifiedLayer;
+        }
+
+        // @formatter:off
+        /**
+         * Sets up the grid scheduler configuration for a layered neural network forward pass.
+         *
+         * This method creates and configures worker grids for different types of GPU operations
+         * in the transformer/ML model pipeline. Each worker grid defines how work should be
+         * distributed across GPU threads (OpenCL work-items or CUDA threads).
+         *
+         * The method creates several worker profiles:
+         * - Single thread operations (activation updates)
+         * - RoPE (Rotary Position Embedding) operations
+         * - Matrix multiplications with different dimensions
+         * - RMS normalization operations
+         * - Parallel attention computations
+         * - Cache copying operations
+         * - Vocabulary projections
+         *
+         * Each worker grid maps to equivalent OpenCL NDRange or CUDA grid/block configurations:
+         * - setGlobalWork() ≈ OpenCL global_work_size ≈ CUDA grid dimensions × block dimensions
+         * - setLocalWork() ≈ OpenCL local_work_size ≈ CUDA block dimensions
+         *
+         * @return GridScheduler configured with all necessary worker grids for the model layers
+         */
+        // @formatter:on
+        private GridScheduler setupGridSchedulersLayered() {
+            GridScheduler tornadoForwardScheduler = new GridScheduler();
+
+            // Single worker for tasks running with a single thread
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[1,1,1], localWorkSize=[1,1,1])
+            // CUDA equivalent: kernel<<<1, 1>>>
+            WorkerGrid singleWorker = new WorkerGrid1D(1);
+            singleWorker.setGlobalWork(1, 1, 1);
+            singleWorker.setLocalWork(1, 1, 1);
+
+            // config.dim / 2 Worker for RoPE
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim/2,1,1], localWorkSize=[128,1,1])
+            // CUDA equivalent: kernel<<<(config.dim/2)/128, 128>>>
+            WorkerGrid ropeWorker = new WorkerGrid1D(config.dim / 2);
+            ropeWorker.setGlobalWork(config.dim / 2, 1, 1);
+            ropeWorker.setLocalWork(128, 1, 1);
+
+            // config.dim Worker for Row major access
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim*LOCAL_WORK_GROUP_SIZE_ALLOC,1,1], localWorkSize=[LOCAL_WORK_GROUP_SIZE_ALLOC,1,1])
+            // CUDA equivalent: kernel<<<config.dim, LOCAL_WORK_GROUP_SIZE_ALLOC>>>
+            int configDimRowMajorGlobal = config.dim * LOCAL_WORK_GROUP_SIZE_ALLOC;
+            WorkerGrid configDimRowMajorGlobalWorker = new WorkerGrid1D(configDimRowMajorGlobal);
+            configDimRowMajorGlobalWorker.setLocalWork(LOCAL_WORK_GROUP_SIZE_ALLOC, 1, 1);
+
+            // config.kvDim Worker for Row major access
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.kvDim*LOCAL_WORK_GROUP_SIZE_ALLOC,1,1], localWorkSize=[LOCAL_WORK_GROUP_SIZE_ALLOC,1,1])
+            // CUDA equivalent: kernel<<<config.kvDim, LOCAL_WORK_GROUP_SIZE_ALLOC>>>
+            int configKvDimRowMajorGlobal = config.kvDim * LOCAL_WORK_GROUP_SIZE_ALLOC;
+            WorkerGrid configKvDimRowMajorGlobalWorker = new WorkerGrid1D(configKvDimRowMajorGlobal);
+            configKvDimRowMajorGlobalWorker.setLocalWork(LOCAL_WORK_GROUP_SIZE_ALLOC, 1, 1);
+
+            // config.hiddenDim * 32 Worker for Row major access
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.hiddenDim*LOCAL_WORK_GROUP_SIZE_ALLOC,1,1], localWorkSize=[LOCAL_WORK_GROUP_SIZE_ALLOC,1,1])
+            // CUDA equivalent: kernel<<<config.hiddenDim, LOCAL_WORK_GROUP_SIZE_ALLOC>>>
+            int configHiddenDimRowMajor = config.hiddenDim * LOCAL_WORK_GROUP_SIZE_ALLOC;
+            WorkerGrid configHiddenDimRowMajorWorker = new WorkerGrid1D(configHiddenDimRowMajor);
+            configHiddenDimRowMajorWorker.setLocalWork(LOCAL_WORK_GROUP_SIZE_ALLOC, 1, 1);
+
+            // RMSNorm worker configuration
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim,1,1], localWorkSize=[256,1,1])
+            // CUDA equivalent: kernel<<<config.dim/256, 256>>>
+            WorkerGrid rmsNormWorker = new WorkerGrid1D(config.dim);
+            rmsNormWorker.setGlobalWork(config.dim, 1, 1);    // Set global work size to total dimension
+            rmsNormWorker.setLocalWork(256, 1, 1);            // Set local work size to 256 (standard efficient size)
+
+            // Parallel attention worker configuration
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.numberOfHeads*8,1,1], localWorkSize=[8,1,1])
+            // CUDA equivalent: kernel<<<config.numberOfHeads, 8>>>
+            WorkerGrid parallelAttentionWorker = new WorkerGrid1D(config.numberOfHeads);
+            // the global work size is numberOfHeads * localWorkGroupSize, where the localWorkGroupSize is currently 8
+            parallelAttentionWorker.setGlobalWork(config.numberOfHeads * 8, 1, 1);
+            parallelAttentionWorker.setLocalWork(8, 1, 1);    // Set local work size to 8 (for parallel attention)
+
+            // Copy to caches worker configuration
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim,1,1], localWorkSize=[128,1,1])
+            // CUDA equivalent: kernel<<<config.dim/128, 128>>>
+            WorkerGrid copyToCachesWorker = new WorkerGrid1D(config.kvDim);
+            copyToCachesWorker.setGlobalWork(config.dim, 1, 1);
+            copyToCachesWorker.setLocalWork(128, 1, 1);    // Set local work size to 128 (for copying to caches)
+
+            // Map workers to tasks
+            tornadoForwardScheduler.addWorkerGrid("activationUpdate.updateX", singleWorker);
+            for (int i = 0; i < config.numberOfLayers; i++) {
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".qmatmul", configDimRowMajorGlobalWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".kmatmul", configKvDimRowMajorGlobalWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".vmatmul", configKvDimRowMajorGlobalWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".rope", ropeWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".matmul1", configDimRowMajorGlobalWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".projectionTwo", configDimRowMajorGlobalWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".fused_ffn_w1_w3", configHiddenDimRowMajorWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".reductionsOneBlock", rmsNormWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".mapContext", rmsNormWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".reductionsOneBlockFFN", rmsNormWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".mapContextFFN", rmsNormWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".parallel-attention", parallelAttentionWorker);
+                tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".copyToCaches", copyToCachesWorker);
+            }
+
+            // Vocabulary worker configuration
+            // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.vocabularySize*LOCAL_WORK_GROUP_SIZE_ALLOC*THREAD_SCALE_FOR_LOGITS,1,1], localWorkSize=[LOCAL_WORK_GROUP_SIZE_ALLOC*THREAD_SCALE_FOR_LOGITS,1,1])
+            // CUDA equivalent: kernel<<<config.vocabularySize, LOCAL_WORK_GROUP_SIZE_ALLOC * THREAD_SCALE_FOR_LOGITS>>>
+            int vocabSizeRowMajor = config.vocabularySize * LOCAL_WORK_GROUP_SIZE_ALLOC * THREAD_SCALE_FOR_LOGITS;
+            WorkerGrid vocabWorker = new WorkerGrid1D(vocabSizeRowMajor);
+            vocabWorker.setLocalWork(LOCAL_WORK_GROUP_SIZE_ALLOC * THREAD_SCALE_FOR_LOGITS, 1, 1);
+
+            tornadoForwardScheduler.addWorkerGrid("logits.projection", vocabWorker);
+            tornadoForwardScheduler.addWorkerGrid("logits.reductionsOneBlockLogits", rmsNormWorker);
+            tornadoForwardScheduler.addWorkerGrid("logits.mapContextLogits", rmsNormWorker);
+
+            return tornadoForwardScheduler;
+        }
+
+    }
diff --git a/src/main/java/com/example/tornadovm/TornadoVMMasterPlan.java b/src/main/java/com/example/tornadovm/TornadoVMMasterPlan.java
index 69e99edc..5770fb5d 100644
--- a/src/main/java/com/example/tornadovm/TornadoVMMasterPlan.java
+++ b/src/main/java/com/example/tornadovm/TornadoVMMasterPlan.java
@@ -67,7 +67,7 @@ public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Llama mod
         // Record time after warmup
         if (ENABLE_TORNADOVM_INIT_TIME) {
             warmupTime = System.nanoTime();
-            System.err.printf("JIT warmup completed: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0);
+            System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0);
         }
 
         // 3. Perform copy-in of read-only weights and objects
@@ -76,8 +76,8 @@ public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Llama mod
         // Record final timing information
         if (ENABLE_TORNADOVM_INIT_TIME) {
             long copyTime = System.nanoTime();
-            System.err.printf("Read-only weight transfer to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0);
-            System.err.printf("Total TornadoVM initialization: %.2f ms\n\n", (copyTime - startTime) / 1_000_000.0);
+            System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0);
+            System.err.printf("Finished TornadoVM initialization...\n\n");
         }
 
         return tornadoVMPlan;
diff --git a/src/main/java/com/example/tornadovm/TransformerComputeKernelsLayered.java b/src/main/java/com/example/tornadovm/TransformerComputeKernelsLayered.java
index c67f8052..0b246d48 100644
--- a/src/main/java/com/example/tornadovm/TransformerComputeKernelsLayered.java
+++ b/src/main/java/com/example/tornadovm/TransformerComputeKernelsLayered.java
@@ -303,7 +303,7 @@ public static void processHeadsFlashAttention(KernelContext context, FloatArray
         int pos = positionHolder.get(0);
         int loff = layer * contextLength * kvDim;
         int kvHeadIdx = h / kvMul;
-        int BLOCK_SIZE_C = 4;
+        int BLOCK_SIZE_C = 8;
 
         // Allocate shared memory for tiled computation
         float[] q_shared = context.allocateFloatLocalArray(headSize);
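Reviewer note: the WorkerGrid comments in TornadoVMLayerPlanner describe how TornadoVM launch configurations map onto OpenCL NDRanges and CUDA grids. A minimal standalone sketch of the row-major pattern used by the matmul tasks (one work-group of LOCAL_WORK_GROUP_SIZE_ALLOC threads per output row); the helper name rowMajorMatVecWorker and the dim value are illustrative, not part of this PR:

    import uk.ac.manchester.tornado.api.GridScheduler;
    import uk.ac.manchester.tornado.api.WorkerGrid;
    import uk.ac.manchester.tornado.api.WorkerGrid1D;

    public final class WorkerGridSketch {
        // One work-group per output row:
        //   global = rows * localSize  (OpenCL global_work_size; CUDA: rows blocks x localSize threads)
        //   local  = localSize         (OpenCL local_work_size;  CUDA: block size)
        static WorkerGrid rowMajorMatVecWorker(int rows, int localSize) {
            WorkerGrid worker = new WorkerGrid1D(rows * localSize);
            worker.setLocalWork(localSize, 1, 1);
            return worker;
        }

        public static void main(String[] args) {
            int dim = 2048; // illustrative model width
            GridScheduler scheduler = new GridScheduler();
            // Equivalent to the "layer_0.qmatmul" mapping in setupGridSchedulersLayered()
            scheduler.addWorkerGrid("layer_0.qmatmul", rowMajorMatVecWorker(dim, 32));
        }
    }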
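Reviewer note: the kernel and scheduler changes move in lockstep. BLOCK_SIZE_C in processHeadsFlashAttention goes from 4 to 8 together with parallelAttentionWorker's local work size (4 -> 8), and THREAD_SCALE_FOR_LOGITS = 8 keeps the localSize argument of the "projection" task (LOCAL_WORK_GROUP_SIZE_ALLOC * THREAD_SCALE_FOR_LOGITS = 256) in sync with vocabWorker.setLocalWork(256, 1, 1). A small illustrative check of that invariant (constant names are from this PR; the check itself is not):

    public final class LocalSizeCheck {
        public static void main(String[] args) {
            int LOCAL_WORK_GROUP_SIZE_ALLOC = 32;
            int THREAD_SCALE_FOR_LOGITS = 8;
            // localSize passed to the projection kernel in configureQuantizedMatrixVectorFinalWeight
            int kernelLocalSize = LOCAL_WORK_GROUP_SIZE_ALLOC * THREAD_SCALE_FOR_LOGITS;
            // local work size configured on vocabWorker in setupGridSchedulersLayered
            int schedulerLocalSize = LOCAL_WORK_GROUP_SIZE_ALLOC * THREAD_SCALE_FOR_LOGITS;
            if (kernelLocalSize != schedulerLocalSize) {
                throw new IllegalStateException("logits.projection local size mismatch");
            }
            System.out.println("logits.projection local size: " + kernelLocalSize);
        }
    }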
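Reviewer note: the new --verbose-init flag only sets -Dllama.EnableTimingForTornadoVMInit=true; TornadoVMMasterPlan then gates its timing printfs on ENABLE_TORNADOVM_INIT_TIME. A minimal sketch of how that constant is presumably derived from the system property; the exact wiring is an assumption, as it is not shown in this diff:

    public final class InitTimingSketch {
        // Assumed wiring: read the property that llama-tornado sets for --verbose-init
        static final boolean ENABLE_TORNADOVM_INIT_TIME =
                Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false"));

        public static void main(String[] args) throws InterruptedException {
            long start = System.nanoTime();
            Thread.sleep(25); // stand-in for plan creation and JIT warmup
            if (ENABLE_TORNADOVM_INIT_TIME) {
                System.err.printf("Java to GPU JIT compiler warmup: %.2f ms%n",
                        (System.nanoTime() - start) / 1_000_000.0);
            }
        }
    }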