Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llm_action/.env.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# LLM
ANTHROPIC_API_KEY = "sk-ant-api03-..."
GEMINI_API_KEY = "..."
GROQ_API_KEY = "gsk_..."

# MLIR
MLIR_SHARED_LIBS=/path/to/llvm-project/build/lib/libomp.so,/path/to/llvm-project/build/lib/libmlir_c_runner_utils.so,/path/to/llvm-project/build/lib/libmlir_runner_utils.so
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module {
// Timer hook resolved at runtime; per the runner libs referenced in .env
// (libmlir_c_runner_utils.so) this presumably returns a nanosecond timestamp
// — confirm against the execution harness.
func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}
// Benchmark entry point: times one 1x1 conv (NCHW input, FCHW filter)
// N=128, C=32, H=W=7, F=256; returns elapsed time in nanoTime units (i64).
// Buffers are caller-allocated; %arg2 is written in place.
func.func @main(%arg0: memref<128x32x7x7xf64>, %arg1: memref<256x32x1x1xf64>, %arg2: memref<128x256x7x7xf64>) -> i64 attributes {llvm.emit_c_interface} {
%0 = call @nanoTime() : () -> i64
// Unit strides/dilations; 1x1 kernel keeps output spatial dims at 7x7.
// tag = "operation_0" labels the op so transform scripts can match it.
linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>, tag = "operation_0"} ins(%arg0, %arg1 : memref<128x32x7x7xf64>, memref<256x32x1x1xf64>) outs(%arg2 : memref<128x256x7x7xf64>)
%2 = call @nanoTime() : () -> i64
// Elapsed = end - start.
%3 = arith.subi %2, %0 : i64
return %3 : i64
}
}
29 changes: 29 additions & 0 deletions llm_action/data/memref/conv2d/conv_2d_nchw_fchw_template.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
module {
// Runtime-provided timer (presumably nanoseconds — confirm against runner).
func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}
// TEMPLATE: [N],[C],[H],[W],[F],[KH],[KW],[OH],[OW] are placeholders to be
// substituted with concrete integers before this file is valid MLIR.
// Instantiations must satisfy OH = H - KH + 1 and OW = W - KW + 1
// (unit strides/dilations below).
func.func @main(
%arg0: memref<[N]x[C]x[H]x[W]xf64>,
%arg1: memref<[F]x[C]x[KH]x[KW]xf64>,
%arg2: memref<[N]x[F]x[OH]x[OW]xf64>
) -> i64
attributes {llvm.emit_c_interface} {
%0 = call @nanoTime() : () -> i64
// tag = "operation_0" lets transform scripts match this op by attribute.
linalg.conv_2d_nchw_fchw
{ dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>,
tag = "operation_0"
}
ins(
%arg0, %arg1 :
memref<[N]x[C]x[H]x[W]xf64>,
memref<[F]x[C]x[KH]x[KW]xf64>
)
outs(
%arg2 : memref<[N]x[F]x[OH]x[OW]xf64>
)

%2 = call @nanoTime() : () -> i64
// Elapsed = end - start, returned to the harness.
%3 = arith.subi %2, %0 : i64

return %3 : i64
}
}
25 changes: 25 additions & 0 deletions llm_action/data/memref/generic/generic_8_8_16_8_32.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
module {
// Runtime-provided timer (presumably nanoseconds — confirm against runner).
func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}
// Benchmark: elementwise accumulate over a rank-5 buffer,
// %arg1[i] = %arg1[i] + %arg0[i], timed; returns elapsed i64.
func.func @main(
%arg0: memref<8x8x16x8x32xf64>,
%arg1: memref<8x8x16x8x32xf64>
) -> i64 attributes {llvm.emit_c_interface} {
%t0 = call @nanoTime() : () -> i64
// Both maps are identity over (a,b,c,d,e) and all iterators are parallel,
// so this is a pure elementwise op with no reduction dimension.
linalg.generic {
indexing_maps = [
affine_map<(a,b,c,d,e) -> (a,b,c,d,e)>,
affine_map<(a,b,c,d,e) -> (a,b,c,d,e)>
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"],
tag = "operation_0"
} ins(%arg0 : memref<8x8x16x8x32xf64>)
outs(%arg1 : memref<8x8x16x8x32xf64>) {
// %in comes from %arg0, %acc is the current value of %arg1 at this index.
^bb0(%in: f64, %acc: f64):
%sum = arith.addf %acc, %in : f64
linalg.yield %sum : f64
}
%t1 = call @nanoTime() : () -> i64
%dt = arith.subi %t1, %t0 : i64
return %dt : i64
}
}
25 changes: 25 additions & 0 deletions llm_action/data/memref/generic/generic_template.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
module {
// Runtime-provided timer (presumably nanoseconds — confirm against runner).
func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}
// TEMPLATE: [A]..[E] are placeholders for concrete dimension sizes; both
// operands must be instantiated with the same shape (identity indexing maps).
// Computes %arg1 += %arg0 elementwise and returns the elapsed i64.
func.func @main(
%arg0: memref<[A]x[B]x[C]x[D]x[E]xf64>,
%arg1: memref<[A]x[B]x[C]x[D]x[E]xf64>
) -> i64 attributes {llvm.emit_c_interface} {
%t0 = call @nanoTime() : () -> i64
// Identity maps + all-parallel iterators: pure elementwise traversal.
linalg.generic {
indexing_maps = [
affine_map<(a,b,c,d,e) -> (a,b,c,d,e)>,
affine_map<(a,b,c,d,e) -> (a,b,c,d,e)>
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"],
tag = "operation_0"
} ins(%arg0 : memref<[A]x[B]x[C]x[D]x[E]xf64>)
outs(%arg1 : memref<[A]x[B]x[C]x[D]x[E]xf64>) {
^bb0(%in: f64, %acc: f64):
%sum = arith.addf %acc, %in : f64
linalg.yield %sum : f64
}
%t1 = call @nanoTime() : () -> i64
%dt = arith.subi %t1, %t0 : i64
return %dt : i64
}
}
10 changes: 10 additions & 0 deletions llm_action/data/memref/matmul/matmul_128_256_128.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module {
  // Timer hook supplied by the MLIR runner utilities (see MLIR_SHARED_LIBS).
  func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}

  // Times a single (128x256) x (256x128) -> (128x128) matmul in f64.
  // Accumulates into caller-provided %arg2; returns end - start (i64).
  func.func @main(%arg0: memref<128x256xf64>, %arg1: memref<256x128xf64>, %arg2: memref<128x128xf64>) -> i64 attributes {llvm.emit_c_interface} {
    %t_start = call @nanoTime() : () -> i64
    // tag = "operation_0" marks the op for transform-dialect matching.
    linalg.matmul {tag = "operation_0"}
      ins(%arg0, %arg1 : memref<128x256xf64>, memref<256x128xf64>)
      outs(%arg2 : memref<128x128xf64>)
    %t_end = call @nanoTime() : () -> i64
    %elapsed = arith.subi %t_end, %t_start : i64
    return %elapsed : i64
  }
}
10 changes: 10 additions & 0 deletions llm_action/data/memref/matmul/matmul_24576_768_384.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module {
  // Timer hook supplied by the MLIR runner utilities (see MLIR_SHARED_LIBS).
  func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}

  // Times a single (24576x768) x (768x384) -> (24576x384) matmul in f64.
  // Accumulates into caller-provided %arg2; returns end - start (i64).
  func.func @main(%arg0: memref<24576x768xf64>, %arg1: memref<768x384xf64>, %arg2: memref<24576x384xf64>) -> i64 attributes {llvm.emit_c_interface} {
    %t_start = call @nanoTime() : () -> i64
    // tag = "operation_0" marks the op for transform-dialect matching.
    linalg.matmul {tag = "operation_0"}
      ins(%arg0, %arg1 : memref<24576x768xf64>, memref<768x384xf64>)
      outs(%arg2 : memref<24576x384xf64>)
    %t_end = call @nanoTime() : () -> i64
    %elapsed = arith.subi %t_end, %t_start : i64
    return %elapsed : i64
  }
}
10 changes: 10 additions & 0 deletions llm_action/data/memref/matmul/matmul_256_512_1024.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module {
  // Timer hook supplied by the MLIR runner utilities (see MLIR_SHARED_LIBS).
  func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}

  // Times a single (256x512) x (512x1024) -> (256x1024) matmul in f64.
  // Accumulates into caller-provided %arg2; returns end - start (i64).
  func.func @main(%arg0: memref<256x512xf64>, %arg1: memref<512x1024xf64>, %arg2: memref<256x1024xf64>) -> i64 attributes {llvm.emit_c_interface} {
    %t_start = call @nanoTime() : () -> i64
    // tag = "operation_0" marks the op for transform-dialect matching.
    linalg.matmul {tag = "operation_0"}
      ins(%arg0, %arg1 : memref<256x512xf64>, memref<512x1024xf64>)
      outs(%arg2 : memref<256x1024xf64>)
    %t_end = call @nanoTime() : () -> i64
    %elapsed = arith.subi %t_end, %t_start : i64
    return %elapsed : i64
  }
}
10 changes: 10 additions & 0 deletions llm_action/data/memref/matmul/matmul_512_512_512.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module {
  // Timer hook supplied by the MLIR runner utilities (see MLIR_SHARED_LIBS).
  func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}

  // Times a single square (512x512) x (512x512) -> (512x512) matmul in f64.
  // Accumulates into caller-provided %arg2; returns end - start (i64).
  func.func @main(%arg0: memref<512x512xf64>, %arg1: memref<512x512xf64>, %arg2: memref<512x512xf64>) -> i64 attributes {llvm.emit_c_interface} {
    %t_start = call @nanoTime() : () -> i64
    // tag = "operation_0" marks the op for transform-dialect matching.
    linalg.matmul {tag = "operation_0"}
      ins(%arg0, %arg1 : memref<512x512xf64>, memref<512x512xf64>)
      outs(%arg2 : memref<512x512xf64>)
    %t_end = call @nanoTime() : () -> i64
    %elapsed = arith.subi %t_end, %t_start : i64
    return %elapsed : i64
  }
}
14 changes: 14 additions & 0 deletions llm_action/data/memref/matmul/matmul_template.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module {
// Runtime-provided timer (presumably nanoseconds — confirm against runner).
func.func private @nanoTime() -> i64 attributes {llvm.emit_c_interface}
// TEMPLATE: [I],[J],[K] are placeholders to be substituted with concrete
// integers before this is valid MLIR. Computes a (IxJ) x (JxK) -> (IxK)
// f64 matmul into caller-provided %arg2 and returns the elapsed i64.
func.func @main(
%arg0: memref<[I]x[J]xf64>,
%arg1: memref<[J]x[K]xf64>,
%arg2: memref<[I]x[K]xf64>
) -> i64 attributes {llvm.emit_c_interface} {
%0 = call @nanoTime() : () -> i64
// tag = "operation_0" labels the op for transform-dialect matching.
linalg.matmul {tag = "operation_0"} ins(%arg0, %arg1 : memref<[I]x[J]xf64>, memref<[J]x[K]xf64>) outs(%arg2 : memref<[I]x[K]xf64>)
%2 = call @nanoTime() : () -> i64
// Elapsed = end - start.
%3 = arith.subi %2, %0 : i64
return %3 : i64
}
}
2 changes: 1 addition & 1 deletion llm_action/docs/MCP_REFERENCE.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ Args:
Returns:
float: the median execution time in milliseconds.

## measure_speedup(mlir_base_execution_time: float, mlir_optimized_execution_time: float, torch_execution_time: float) -> dict[str, float]
## measure_speedup(mlir_base_execution_time: float, mlir_optimized_execution_time: float, torch_execution_time: Optional[float] = None) -> dict[str, float]
Measures the speedup achieved by MLIR transformations.

This tool compares the execution time of base code against transformed code
Expand Down
17 changes: 15 additions & 2 deletions llm_action/playground/actions/tests/tiling.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from llm_action.src.models import KernelType
from llm_action.src.utils.persistence import load_kernel_code
from llm_action.src.execution.mlir_execution import execute_mlir

from llm_action.playground.actions.candidates.Tiling_af31 import TilingAction
from llm_action.playground.actions.candidates.Tile import Tile
Expand All @@ -24,17 +25,29 @@
ACTION = Tiling

for kernel_type in [KernelType.MATMUL, KernelType.CONV2D, KernelType.GENERIC]:
print(f"--- Testing Tiling Action on {kernel_type.value} Kernel ---\n")
print(f"--- Testing {ACTION.__name__} on {kernel_type.value} Kernel ---\n")
code = load_kernel_code(kernel_type)
print(f"Original Code:\n{code}\n")



print(f"Original Execution; Success = {success}, Time = {original_time_ns} ns")

parameters = params_per_kernel[kernel_type]

print(f"Using Parameters: {parameters}\n")

if ACTION.precondition(code, parameters):
transformed_code = ACTION.implement(code, parameters)

print(f"Transformed Code:\n{transformed_code}\n")

transformed_time_ns, success = execute_mlir(transformed_code)

print(f"Transformed Execution; Success = {success}, Time = {transformed_time_ns} ns")

print(f"Speedup = {(original_time_ns / transformed_time_ns):.4f}")

if ACTION.postcondition(code, transformed_code, parameters):
print(f"Postcondition satisfied: {ACTION.__name__} applied successfully.")
else:
Expand Down
7 changes: 3 additions & 4 deletions llm_action/resources/prompts/v1/action_enumeration.md
Original file line number Diff line number Diff line change
Expand Up @@ -331,12 +331,11 @@ Acceptable kernel-specific examples (when framed generically):
## Examples (non-exhaustive):
- Tiling / blocking
- Interchange (loop permutation)
- Fusion (producer-consumer)
- Vectorization (SIMD-friendly restructuring)
- Parallelization / distribution
- Promotion
- Packing / layout transformation
- Unrolling / jamming / peeling
- Decomposition of complex ops
- Bufferization strategy (conceptual)
- Canonicalization / simplification (conceptual)
- Special kernel-specific operations (e.g., im2col for convolution)
Expand Down Expand Up @@ -395,8 +394,8 @@ class ActionEnumeration(BaseModel):

# Output Constraints

- Produce **2-3 optimization intents**.
- Each intent must contain **2-3 transformations**.
- Produce **3-5 optimization intents**.
- Each intent must contain **3-5 transformations**.
- Use consistent transformation names across intents (avoid duplicates with different names).
- Keep descriptions concise (1-2 sentences).
- Do **not** include parameter knobs, preconditions, ordering rules, or code.
Expand Down
41 changes: 26 additions & 15 deletions llm_action/resources/prompts/v1/action_implementation.md
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,21 @@ You are **not** responsible for:
Your job is to turn **one abstract transformation idea** into
**one concrete executable action**.

# Hardware Specifications
- Primary target: **HPC-class CPU** — specifically **Intel Xeon E5-2680 v4 (Broadwell-class)**.
- Topology:
* **28 physical cores** (2 sockets x 14 cores), **2 NUMA nodes**.
* **No SMT / Hyper-threading disabled** (threads per core = 1).
- SIMD / ISA capabilities:
* **AVX2 + FMA available**.
* **No AVX-512** (do not assume AVX-512 vector widths, masks, or AVX-512-specific lowering).
* Practical vector lane guidance:
- FP32: typically 8 lanes per vector (256-bit)
- FP64: typically 4 lanes per vector (256-bit)
- Cache hierarchy characteristics:
* L1d ~32KB per core, L2 ~256KB per core, shared L3 per socket (~tens of MB).
- Number of cores in the execution environment (submitted MLIR/PyTorch jobs): **16 physical cores**.

# Your Task

You will be given the following inputs:
Expand Down Expand Up @@ -337,7 +352,7 @@ Each Action must define the following conceptual stages:

## Tooling Available (Allowed and Encouraged)

You may use the following tool to validate the MLIR transform while synthesizing it:
You may use the following MCP tools to validate the MLIR transform while synthesizing it:

- `delegate_documentation_lookup(task: str) -> str`
Delegates Transform dialect documentation lookup to a deterministic retrieval agent. Example tasks:
Expand All @@ -346,13 +361,13 @@ You may use the following tool to validate the MLIR transform while synthesizing
- "What is the Transform dialect op for loop interchange?"
This lookup agent provides authoritative, pre-scraped MLIR Transform dialect documentation, including exact operation names, required handles, key attributes, and minimal Transform IR skeletons, and should be used to ground Transform dialect usage before implementation.

- `transform_code(code: str, transformation_code: str) -> str`
- `transform_mlir_code(code: str, transformation_code: str) -> str`
Applies Transform dialect code and returns transformed MLIR.

- `execute_code(code: str) -> tuple[int, bool]`
- `execute_mlir_code(code: str) -> tuple[float, bool]`
Executes the payload and returns (execution_time in ms, success_flag).

- `measure_speedup(base_execution_time: float, execution_time: float) -> float`
- `measure_speedup(mlir_base_execution_time: float, mlir_optimized_execution_time: float) -> dict[str, float]`
Computes the relative speedup between baseline and transformed execution times (per the MCP reference, this returns a dict of speedup metrics and also accepts an optional `torch_execution_time`).

Use these tools to ensure your transform snippet is syntactically valid, changes the IR when it should, and preserves executability when appropriate. Make sure to input actual MLIR code instances (actual numbers instead of [I], [OH], etc.).
Expand All @@ -370,17 +385,17 @@ For each kernel instance you test during synthesis, follow systematically this p
call `delegate_documentation_lookup(...)` before writing or revising transform IR.

2. **Baseline execution sanity**
- Call `execute_code(original_code)`.
- Call `execute_mlir_code(original_code)`.
- Require `success_flag == True`.
- If baseline execution fails, do not proceed with transform testing on that instance.

3. **Transform application sanity**
- Call `transform_code(original_code, transform_ir)`.
- Call `transform_mlir_code(original_code, transform_ir)`.
- Require that the returned MLIR differs from the input (`transformed.strip() != original.strip()`).
- If the transform produces identical code or throws, treat it as a failed transform attempt.

4. **Post-transform execution sanity**
- Call `execute_code(transformed_code)`.
- Call `execute_mlir_code(transformed_code)`.
- Require `success_flag == True`.
- If execution fails, the transform is not acceptable and must be revised.

Expand All @@ -401,18 +416,14 @@ and must NOT materialize large tensor tiles as vectors.
When a transformation introduces `vector<...>` types, you MUST ensure:

1) **Bound total vector size**
- Let `N = product(static vector dimensions)`.
- Limits by element type:
- `f64` / `i64`: `N ≤ 16`
- `f32` / `i32`: `N ≤ 32`
- `f16` / `bf16` / `i16`: `N ≤ 64`
- `i8`: `N ≤ 128`
- Let `N` be the product of all static vector dimensions (i.e. the total number of elements in the vector).
- Limit: `N ≤ 1024`.
- If any vector exceeds its bound → **reject the candidate immediately**.

2) **Limit vector rank**
- Prefer rank-1 vectors: `vector<kxf32>`
- Allow rank-2 and rank-3 vectors only if small (e.g. `vector<4x8xf32>, vector<4x4x4xf32>`).
- Rank ≥ 4 vectors are **disallowed**, regardless of element count.
- Allow rank-2 vectors only if they are small (e.g. `vector<4x8xf32>`).
- Rank ≥ 3 vectors are **disallowed**, unless they are very small (e.g. `vector<2x2x2xf32>` or `vector<4x4x4xf32>`).

3) **No tile-as-vector lowering**
- Vectors resembling whole tiles or buffers
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
TASK:
How to do vectorization in MLIR Transform dialect?

RESPONSE:
{
"error": {
"code": 429,
"message": "You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash\nPlease retry in 443.929576ms.",
"status": "RESOURCE_EXHAUSTED",
"details": [
{
"@type": "type.googleapis.com/google.rpc.Help",
"links": [
{
"description": "Learn more about Gemini API quotas",
"url": "https://ai.google.dev/gemini-api/docs/rate-limits"
}
]
},
{
"@type": "type.googleapis.com/google.rpc.QuotaFailure",
"violations": [
{
"quotaMetric": "generativelanguage.googleapis.com/generate_content_free_tier_requests",
"quotaId": "GenerateRequestsPerDayPerProjectPerModel-FreeTier",
"quotaDimensions": {
"location": "global",
"model": "gemini-2.5-flash"
},
"quotaValue": "20"
}
]
},
{
"@type": "type.googleapis.com/google.rpc.RetryInfo",
"retryDelay": "0s"
}
]
}
}
Loading