From 4e97376aed5cca60282906521c4458f85540ebeb Mon Sep 17 00:00:00 2001 From: majin0824 Date: Mon, 2 Feb 2026 19:23:57 +0800 Subject: [PATCH 1/3] =?UTF-8?q?fuzz=E9=9A=8F=E6=9C=BA=E7=94=9F=E6=88=90op?= =?UTF-8?q?=E7=BB=84=E5=90=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/fuzzer/README.md | 263 +++++++ src/fuzzer/__init__.py | 39 ++ src/fuzzer/conftest.py | 43 ++ src/fuzzer/example_multi_kernel.py | 171 +++++ .../generated_tests/test_fuzz_multi_kernel.py | 132 ++++ src/fuzzer/src/__init__.py | 16 + src/fuzzer/src/fuzzer.py | 412 +++++++++++ src/fuzzer/src/kernel_generator.py | 206 ++++++ src/fuzzer/src/multi_kernel_test_generator.py | 651 ++++++++++++++++++ src/fuzzer/src/orchestrator_generator.py | 298 ++++++++ 10 files changed, 2231 insertions(+) create mode 100644 src/fuzzer/README.md create mode 100644 src/fuzzer/__init__.py create mode 100644 src/fuzzer/conftest.py create mode 100644 src/fuzzer/example_multi_kernel.py create mode 100644 src/fuzzer/generated_tests/test_fuzz_multi_kernel.py create mode 100644 src/fuzzer/src/__init__.py create mode 100644 src/fuzzer/src/fuzzer.py create mode 100644 src/fuzzer/src/kernel_generator.py create mode 100644 src/fuzzer/src/multi_kernel_test_generator.py create mode 100644 src/fuzzer/src/orchestrator_generator.py diff --git a/src/fuzzer/README.md b/src/fuzzer/README.md new file mode 100644 index 0000000..e1b6dd0 --- /dev/null +++ b/src/fuzzer/README.md @@ -0,0 +1,263 @@ +# 多内核模糊测试框架 + +这是一个用于生成和测试多内核 PyPTO 程序的自动化框架。该框架可以随机生成多个 InCore 内核函数,并通过 Orchestration 函数以不同的模式组合它们。 + +**注意**:`src/fuzzer` 是一个独立的框架,不依赖 `src/pto_test/fuzzing`。所有必要的代码都包含在此目录中。 + +## 快速开始 + +```bash +# 生成1个测试用例 +python src/fuzzer/example_multi_kernel.py --num-cases 1 + +# 生成5个测试用例 +python src/fuzzer/example_multi_kernel.py --num-cases 5 + +# 运行测试(只生成代码) +pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only + +# 查看生成的 C++ 代码 +pytest 
src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only --save-kernels --kernels-dir=/tmp/kernels +``` + +## 目录结构 + +``` +src/fuzzer/ # 独立的模糊测试框架 +├── __init__.py # 外部接口 +├── example_multi_kernel.py # 使用示例脚本 +├── conftest.py # pytest 配置 +├── README.md # 本文档 +├── src/ # 内部实现 +│ ├── __init__.py +│ ├── fuzzer.py # OpFuzzer 核心逻辑 +│ ├── kernel_generator.py # InCore 内核生成器 +│ ├── orchestrator_generator.py # Orchestration 组合函数生成器 +│ └── multi_kernel_test_generator.py # 完整测试用例生成器 +└── generated_tests/ # 生成的测试文件目录 + └── test_fuzz_multi_kernel.py # 生成的测试文件 +``` + +## Op 组合规则 + +### 1. 操作符定义 + +操作符在 [fuzzer.py](fuzzer.py) 的 `OpFuzzer.__init__` 方法中定义。 + +**当前支持的操作**: +- **二元操作**: `block.add`, `block.sub`, `block.mul`, `block.div`, `block.maximum` +- **标量操作**: `block.adds`, `block.subs`, `block.muls`, `block.divs` +- **一元操作**: `block.sqrt`, `block.rsqrt`, `block.exp`, `block.neg`, `block.recip` + +**添加新操作**: +```python +# 在 fuzzer.py 的 OpFuzzer.__init__ 中修改 +self.ops = self.BLOCK_BINARY_OPS + self.BLOCK_SCALAR_OPS + self.BLOCK_UNARY_OPS + +# 或者只使用基础操作 +basic_ops = [ + OpSpec("block.add", ["tile", "tile"], "tile", {}, lambda a, b: a + b), + OpSpec("block.sub", ["tile", "tile"], "tile", {}, lambda a, b: a - b), + OpSpec("block.mul", ["tile", "tile"], "tile", {}, lambda a, b: a * b), + OpSpec("block.div", ["tile", "tile"], "tile", {"avoid_zero": True}, lambda a, b: a / b), +] +self.ops = basic_ops +``` + +### 2. 内核生成规则 + +每个 InCore 内核包含: +- **输入**: 1-3 个 tile 张量,**支持不同维度** +- **操作链**: 1-10 个随机操作 +- **输出**: 1 个 tile 张量 + +**输入张量配置**: +- 可以指定每个内核的输入数量和维度 +- 不同内核可以有不同数量的输入(1-3个) +- 每个输入可以有不同的形状(如 128x128, 64x64, 256x256) +- 如果不指定,框架会随机生成输入配置 + +**示例配置**: +```python +# 在 example_multi_kernel.py 中配置 +{ + "name": "test_case_name", + "num_kernels": 3, + "input_shapes_list": [ + [(128, 128), (64, 64)], # kernel_0: 2个不同维度的输入 + [(128, 128), (128, 128), (256, 256)], # kernel_1: 3个不同维度的输入 + [(256, 256)], # kernel_2: 1个输入 + ], +} +``` + +**操作链生成规则**: +1. 从输入张量中随机选择操作数 +2. 
随机选择一个操作符(add/sub/mul/div) +3. 执行操作并生成中间结果 +4. 中间结果可以被后续操作使用 +5. 最后一个操作的结果作为内核输出 + +**示例**: +```python +# 生成的内核代码 - 不同维度的输入 +@pl.function(type=pl.FunctionType.InCore) +def kernel_0(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[64, 64], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + tile_a = pl.op.block.load(a, 0, 0, 128, 128) + tile_b = pl.op.block.load(b, 0, 0, 128, 128) # 加载到输出大小 + tmp_0 = pl.op.block.add(tile_b, tile_a) # 操作1: b + a + tmp_1 = pl.op.block.mul(tmp_0, tile_a) # 操作2: tmp_0 * a + tmp_2 = pl.op.block.sub(tmp_1, tile_b) # 操作3: tmp_1 - b + return tmp_2 +``` + +### 3. 内核组合模式 + +**Sequential (顺序模式)**: +- 内核按顺序执行 +- 每个内核的输出作为下一个内核的输入 +``` +input → kernel_0 → kernel_1 → kernel_2 → output +``` + +**Branching (分支模式)**: +- 多个内核并行执行 +- 使用 merge 内核合并结果 +``` +input → kernel_0 ↘ +input → kernel_1 → merge → output +input → kernel_2 ↗ +``` + +**Mixed (混合模式)**: +- 结合顺序和分支执行 +``` +input → kernel_0 ↘ +input → kernel_1 → merge → kernel_2 → kernel_3 → output +``` + +### 4. 带参数的操作符 + +框架支持带参数的操作符(如 transpose, reduce, reshape): + +```python +# 在 fuzzer.py 中添加 +OpSpec( + "block.transpose", + ["tile"], "tile", {}, + lambda a, dims: np.transpose(a, dims), + shape_transform=lambda shapes, params: tuple(shapes[0][i] for i in params['dims']), + param_generator=lambda shapes, rng: {'dims': (1, 0)}, + requires_params=True +) +``` + +**OpSpec 参数说明**: +- `name`: 操作名称(PyPTO API) +- `input_types`: 输入类型列表 +- `output_type`: 输出类型 +- `constraints`: 约束条件(如 `avoid_zero`, `positive_only`) +- `np_equivalent`: NumPy 参考实现 +- `shape_transform`: shape 变换函数(可选) +- `param_generator`: 参数生成函数(可选) +- `requires_params`: 是否需要参数 + +## 命令行参数 + +### 生成测试用例 + +```bash +python src/fuzzer/example_multi_kernel.py [选项] + +选项: + --num-cases N 生成的测试用例数量 (1-5,默认: 1) + --output PATH 输出文件路径 + --seed N 随机种子 +``` + +### 运行测试 + +```bash +pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py [选项] + +常用选项: + -v 显示详细输出 + -s 显示 print 输出 + --codegen-only 只生成代码,不执行 + --platform=PLATFORM 指定平台(如 a2a3sim) + 
--device=N 指定设备编号 + --save-kernels 保存生成的 C++ 代码 + --kernels-dir=DIR 指定保存目录 + --dump-passes 打印编译器优化 pass +``` + +## 生成的代码结构 + +```python +class TestFuzzSequentialSimple(PTOTestCase): + def get_name(self): + return "fuzz_sequential_simple" + + def define_tensors(self): + return [ + TensorSpec('a', [128, 128], DataType.FP32, is_input=True), + TensorSpec('b', [128, 128], DataType.FP32, is_input=True), + TensorSpec('output', [128, 128], DataType.FP32, is_output=True), + ] + + def get_program(self): + @pl.program + class Program: + @pl.function(type=pl.FunctionType.InCore) + def kernel_0(self, a, b): + # 内核实现 + pass + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator(self, a, b): + # 组合逻辑 + pass + + return Program + + def compute_expected(self, tensors, params=None): + # NumPy 参考实现 + pass +``` + +## 扩展框架 + +### 添加新操作符 + +编辑 [fuzzer.py](fuzzer.py) 的 `OpFuzzer.__init__` 方法: + +```python +# 在 OpFuzzer.__init__ 中 +self.ops = self.BLOCK_BINARY_OPS + self.BLOCK_SCALAR_OPS + self.BLOCK_UNARY_OPS + +# 或者自定义操作集合 +custom_ops = [ + OpSpec("block.add", ["tile", "tile"], "tile", {}, lambda a, b: a + b), + OpSpec("block.maximum", ["tile", "tile"], "tile", {}, lambda a, b: np.maximum(a, b)), + OpSpec("block.sqrt", ["tile"], "tile", {"positive_only": True}, lambda a: np.sqrt(a)), +] +self.ops = custom_ops +``` + +### 添加新组合模式 + +在 [orchestrator_generator.py](orchestrator_generator.py) 中添加新的生成方法。 + +## 注意事项 + +1. **张量形状**: 支持不同维度的输入张量,可以在配置中指定每个内核的输入形状 +2. **数据类型**: 当前仅支持 FP32 类型 +3. **操作约束**: 框架自动处理除零、负数开方等约束 +4. **ISA 支持**: 确保添加的操作在目标硬件的 ISA 中有对应实现 +5. 
**输入数量**: 每个内核支持 1-3 个输入张量,可以在配置中指定 + +## 参考文件 + +- [tests/test_cases/test_matmul.py](../../tests/test_cases/test_matmul.py): PTOTestCase 使用模式 +- [src/fuzzer/src/fuzzer.py](src/fuzzer.py): OpFuzzer 操作生成逻辑和操作符定义 +- [example_multi_kernel.py](example_multi_kernel.py): 配置示例,包括如何指定不同维度的输入 diff --git a/src/fuzzer/__init__.py b/src/fuzzer/__init__.py new file mode 100644 index 0000000..087858e --- /dev/null +++ b/src/fuzzer/__init__.py @@ -0,0 +1,39 @@ +""" +Multi-kernel fuzzing framework for PyPTO programs. + +This is the main entry point for the fuzzer framework. +External users should import from this module. + +Example: + from fuzzer import OpFuzzer, MultiKernelTestGenerator + + # Create a test generator + generator = MultiKernelTestGenerator(seed=42) + + # Generate a test case + test_code = generator.generate_test_case( + class_name="TestMyFuzz", + num_kernels=3, + ops_per_kernel=(2, 5), + composition_style="sequential" + ) +""" + +# Import from internal src module +from .src import ( + OpFuzzer, + OpSpec, + KernelGenerator, + OrchestratorGenerator, + MultiKernelTestGenerator, +) + +__all__ = [ + "OpFuzzer", + "OpSpec", + "KernelGenerator", + "OrchestratorGenerator", + "MultiKernelTestGenerator", +] + +__version__ = "1.0.0" diff --git a/src/fuzzer/conftest.py b/src/fuzzer/conftest.py new file mode 100644 index 0000000..1e9ef67 --- /dev/null +++ b/src/fuzzer/conftest.py @@ -0,0 +1,43 @@ +""" +pytest configuration for generated multi-kernel fuzz tests. + +This conftest imports all fixtures from the main tests/conftest.py +to ensure generated tests have access to the same CLI options and fixtures. 
+""" + +import sys +from pathlib import Path + +# Add framework root to path +_FRAMEWORK_ROOT = Path(__file__).parent.parent.parent.parent +_TESTS_DIR = _FRAMEWORK_ROOT / "tests" + +if str(_TESTS_DIR) not in sys.path: + sys.path.insert(0, str(_TESTS_DIR)) + +# Import all fixtures and configuration from main conftest +from tests.conftest import ( + pytest_addoption, + pytest_configure, + pytest_collection_modifyitems, + test_config, + test_runner, + optimization_strategy, + fuzz_count, + fuzz_seed, + tensor_shape, + STANDARD_SHAPES, +) + +__all__ = [ + 'pytest_addoption', + 'pytest_configure', + 'pytest_collection_modifyitems', + 'test_config', + 'test_runner', + 'optimization_strategy', + 'fuzz_count', + 'fuzz_seed', + 'tensor_shape', + 'STANDARD_SHAPES', +] diff --git a/src/fuzzer/example_multi_kernel.py b/src/fuzzer/example_multi_kernel.py new file mode 100644 index 0000000..d3fac95 --- /dev/null +++ b/src/fuzzer/example_multi_kernel.py @@ -0,0 +1,171 @@ +""" +多内核模糊测试框架使用示例 + +该脚本演示如何使用多内核测试生成器创建测试用例。 +支持通过命令行参数控制生成的测试用例数量和配置。 + +使用方法: + python example_multi_kernel.py --num-cases 5 +""" + +import argparse +import sys +from pathlib import Path + +# 添加当前目录到路径 +_SCRIPT_DIR = Path(__file__).parent +if str(_SCRIPT_DIR) not in sys.path: + sys.path.insert(0, str(_SCRIPT_DIR)) + +from src.multi_kernel_test_generator import MultiKernelTestGenerator + + +def main(): + """主函数""" + parser = argparse.ArgumentParser( + description="生成多内核模糊测试用例", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +示例: + # 生成默认的5个测试用例 + python example_multi_kernel.py + + # 生成3个测试用例 + python example_multi_kernel.py --num-cases 3 + + # 指定输出文件 + python example_multi_kernel.py --output custom_test.py + """ + ) + + parser.add_argument( + "--num-cases", + type=int, + default=1, + choices=range(1, 6), + metavar="N", + help="生成的测试用例数量 (1-5,默认: 5)" + ) + + parser.add_argument( + "--output", + type=str, + default=None, + help="输出文件路径 (默认: 
src/fuzzer/generated_tests/test_fuzz_multi_kernel.py)" + ) + + parser.add_argument( + "--seed", + type=int, + default=4, + help="随机种子,用于可重现性 (默认: 42)" + ) + + args = parser.parse_args() + + # 设置输出路径 + if args.output: + output_path = args.output + else: + output_path = str(_SCRIPT_DIR / "generated_tests" / "test_fuzz_multi_kernel.py") + + print(f"多内核模糊测试生成器") + print(f"=" * 60) + print(f"测试用例数量: {args.num_cases}") + print(f"随机种子: {args.seed}") + print(f"输出文件: {output_path}") + print(f"=" * 60) + print() + + # 定义5种不同配置的测试用例 + all_configs = [ + { + "name": "fuzz_sequential_simple", + "num_kernels": 2, + "mode": "sequential", + "shape": (128, 128), + "num_ops_range": (3, 5), + "input_shapes_list": [ + [(128, 128), (64, 64)], # kernel_0: 2个不同维度的输入 + [(128, 128), (128, 128), (256, 256)], # kernel_1: 3个不同维度的输入 + ], + "description": "简单顺序执行:2个内核,不同维度输入" + }, + { + "name": "fuzz_branching_parallel", + "num_kernels": 3, + "mode": "branching", + "shape": (128, 128), + "num_ops_range": (4, 6), + "input_shapes_list": [ + [(128, 128), (128, 128)], # kernel_0: 2个相同维度 + [(64, 64), (128, 128)], # kernel_1: 2个不同维度 + [(256, 256)], # kernel_2: 1个输入 + ], + "description": "分支并行执行:3个内核,不同输入数量" + }, + { + "name": "fuzz_mixed_complex", + "num_kernels": 4, + "mode": "mixed", + "shape": (128, 128), + "num_ops_range": (5, 8), + "input_shapes_list": None, # 使用随机生成 + "description": "混合模式:前2个并行,后2个顺序,随机输入" + }, + { + "name": "fuzz_sequential_deep", + "num_kernels": 5, + "mode": "sequential", + "shape": (128, 128), + "num_ops_range": (6, 10), + "input_shapes_list": None, # 使用随机生成 + "description": "深度顺序执行:5个内核链式调用,随机输入" + }, + { + "name": "fuzz_branching_wide", + "num_kernels": 4, + "mode": "branching", + "shape": (128, 128), + "num_ops_range": (4, 7), + "input_shapes_list": [ + [(128, 128), (64, 64), (256, 256)], # kernel_0: 3个不同维度 + [(128, 128)], # kernel_1: 1个输入 + [(64, 64), (64, 64)], # kernel_2: 2个相同维度 + [(256, 256), (128, 128)], # kernel_3: 2个不同维度 + ], + "description": "宽分支执行:4个内核,多样化输入配置" + 
}, + ] + + # 根据 num_cases 选择配置 + selected_configs = all_configs[:args.num_cases] + + print("将生成以下测试用例:") + print() + for i, config in enumerate(selected_configs, 1): + print(f"{i}. {config['name']}") + print(f" {config['description']}") + print() + + # 创建生成器 + generator = MultiKernelTestGenerator(seed=args.seed) + + # 生成测试文件 + print("正在生成测试文件...") + generator.generate_test_file( + output_path=output_path, + test_configs=selected_configs, + ) + + print() + print(f"✓ 成功生成 {args.num_cases} 个测试用例") + print(f"✓ 输出文件: {output_path}") + print() + print("运行测试:") + print(f" pytest {output_path}") + print() + + +if __name__ == "__main__": + main() diff --git a/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py b/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py new file mode 100644 index 0000000..0d5c1bf --- /dev/null +++ b/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py @@ -0,0 +1,132 @@ +""" +自动生成的多内核模糊测试用例 + +该文件由 MultiKernelTestGenerator 自动生成。 +包含多个测试用例,每个测试用例包含多个 InCore 内核和一个 Orchestration 函数。 +""" + +import sys +from pathlib import Path +from typing import Any, List + +import numpy as np +import pytest + +from pto_test.core.test_case import DataType, PTOTestCase, TensorSpec + +# 添加 pypto 到路径 +_FRAMEWORK_ROOT = Path(__file__).parent.parent.parent.parent +_PYPTO_ROOT = _FRAMEWORK_ROOT / "3rdparty" / "pypto" / "python" +if _PYPTO_ROOT.exists() and str(_PYPTO_ROOT) not in sys.path: + sys.path.insert(0, str(_PYPTO_ROOT)) + + +class TestFuzzSequentialSimple(PTOTestCase): + """ + 测试用例: fuzz_sequential_simple + 组合模式: sequential + 内核数量: 2 + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.rows = 128 + self.cols = 128 + + def get_name(self) -> str: + return 'fuzz_sequential_simple' + + def define_tensors(self) -> List[TensorSpec]: + return [ + TensorSpec('a', [128, 128], DataType.FP32, init_value=2.0), + TensorSpec('b', [128, 128], DataType.FP32, init_value=2.5), + TensorSpec('c', [256, 256], DataType.FP32, init_value=3.0), + 
TensorSpec('output', [128, 128], DataType.FP32, is_output=True), + ] + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class FuzzSequentialSimpleProgram: + @pl.function(type=pl.FunctionType.InCore) + def kernel_0(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + tile_a = pl.op.block.load(a, 0, 0, 128, 128) + tile_b = pl.op.block.load(b, 0, 0, 128, 128) + tmp_0 = pl.op.block.div(tile_b, tile_a) + tmp_1 = pl.op.block.sub(tmp_0, tile_a) + tmp_2 = pl.op.block.div(tmp_1, tile_b) + return tmp_2 + + @pl.function(type=pl.FunctionType.InCore) + def kernel_1(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], c: pl.Tensor[[256, 256], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + tile_a = pl.op.block.load(a, 0, 0, 128, 128) + tile_b = pl.op.block.load(b, 0, 0, 128, 128) + tile_c = pl.op.block.load(c, 0, 0, 128, 128) + tmp_0 = pl.op.block.add(tile_a, tile_c) + tmp_1 = pl.op.block.neg(tile_b) + tmp_2 = pl.op.block.maximum(tmp_1, tmp_1) + tmp_3 = pl.op.block.rsqrt(tmp_0) + tmp_4 = pl.op.block.add(tmp_2, tmp_3) + return tmp_4 + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], c: pl.Tensor[[256, 256], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + result_0 = self.kernel_0(a, b) + result_1 = self.kernel_1(result_0, b, c) + return result_1 + + return FuzzSequentialSimpleProgram + + def compute_expected(self, tensors, params=None): + """使用 NumPy 计算期望输出""" + def _numpy_kernel_0(self, a, b): + """NumPy 实现: kernel_0""" + # 创建变量环境 + env = {} + env['tile_a'] = a.copy() + env['tile_b'] = b.copy() + + # 执行操作链 + env['tile_b'] = np.where(np.abs(env['tile_b']) < 0.01, 1.0, env['tile_b']) + env['tile_a'] = np.where(np.abs(env['tile_a']) < 0.01, 1.0, env['tile_a']) + env['tmp_0'] = env['tile_b'] / env['tile_a'] + env['tmp_1'] = env['tmp_0'] - env['tile_a'] + env['tmp_1'] = 
np.where(np.abs(env['tmp_1']) < 0.01, 1.0, env['tmp_1']) + env['tile_b'] = np.where(np.abs(env['tile_b']) < 0.01, 1.0, env['tile_b']) + env['tmp_2'] = env['tmp_1'] / env['tile_b'] + return env['tmp_2'] + + def _numpy_kernel_1(self, a, b, c): + """NumPy 实现: kernel_1""" + # 创建变量环境 + env = {} + env['tile_a'] = a.copy() + env['tile_b'] = b.copy() + env['tile_c'] = c.copy() + + # 执行操作链 + env['tmp_0'] = env['tile_a'] + env['tile_c'] + env['tmp_1'] = -env['tile_b'] + env['tmp_2'] = np.maximum(env['tmp_1'], env['tmp_1']) + env['tmp_0'] = np.abs(env['tmp_0']) + 1e-6 + env['tmp_3'] = 1.0 / np.sqrt(env['tmp_0']) + env['tmp_4'] = env['tmp_2'] + env['tmp_3'] + return env['tmp_4'] + + + # 顺序执行模式 + result_0 = self._numpy_kernel_0(tensors['a'], tensors['b']) + result_1 = self._numpy_kernel_1(result_0, tensors['b'], tensors['c']) + tensors['output'][:] = result_1 + + +class TestMultiKernelFuzzing: + """多内核模糊测试套件""" + + def test_fuzz_sequential_simple(self, test_runner): + """测试 fuzz_sequential_simple""" + test_case = TestFuzzSequentialSimple() + result = test_runner.run(test_case) + assert result.passed, f"测试失败: {result.error}" + diff --git a/src/fuzzer/src/__init__.py b/src/fuzzer/src/__init__.py new file mode 100644 index 0000000..5cefd5b --- /dev/null +++ b/src/fuzzer/src/__init__.py @@ -0,0 +1,16 @@ +""" +Internal implementation modules for the fuzzer framework. +""" + +from .fuzzer import OpFuzzer, OpSpec +from .kernel_generator import KernelGenerator +from .orchestrator_generator import OrchestratorGenerator +from .multi_kernel_test_generator import MultiKernelTestGenerator + +__all__ = [ + "OpFuzzer", + "OpSpec", + "KernelGenerator", + "OrchestratorGenerator", + "MultiKernelTestGenerator", +] diff --git a/src/fuzzer/src/fuzzer.py b/src/fuzzer/src/fuzzer.py new file mode 100644 index 0000000..4e5b3bd --- /dev/null +++ b/src/fuzzer/src/fuzzer.py @@ -0,0 +1,412 @@ +""" +Operator fuzzer for generating random operator combinations. 
+""" + +import random +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np # Used in lambda functions for op equivalents + + +@dataclass +class OpSpec: + """Operator specification for fuzzing. + + Attributes: + name: Operator name (e.g., "block.add") + input_types: List of input types (e.g., ["tile", "tile"]) + output_type: Output type (e.g., "tile") + constraints: Additional constraints (e.g., {"min_shape": [64, 64]}) + np_equivalent: NumPy equivalent function for golden reference + shape_transform: Optional callable that computes output shape from input shapes + param_generator: Optional callable that generates operator parameters + requires_params: Whether this operator requires parameters (default: False) + """ + name: str + input_types: List[str] + output_type: str + constraints: Dict[str, Any] + np_equivalent: Optional[Any] = None + shape_transform: Optional[Any] = None + param_generator: Optional[Any] = None + requires_params: bool = False + + def compute_output_shape(self, input_shapes: List[Tuple[int, int]], params: Optional[Dict[str, Any]] = None) -> Tuple[int, int]: + """Compute output shape from input shapes.""" + if self.shape_transform: + import inspect + sig = inspect.signature(self.shape_transform) + if len(sig.parameters) >= 2 and params is not None: + return self.shape_transform(input_shapes, params) + else: + return self.shape_transform(input_shapes) + return input_shapes[0] if input_shapes else (128, 128) + + def generate_params(self, input_shapes: List[Tuple[int, int]], rng) -> Dict[str, Any]: + """Generate operator parameters based on input shapes.""" + if self.param_generator and self.requires_params: + return self.param_generator(input_shapes, rng) + return {} + + +class OpFuzzer: + """Generates random operator combinations for fuzzing.""" + + # Block-level binary operators + BLOCK_BINARY_OPS = [ + OpSpec("block.add", ["tile", "tile"], "tile", {}, lambda a, b: a + b), + 
OpSpec("block.sub", ["tile", "tile"], "tile", {}, lambda a, b: a - b), + OpSpec("block.mul", ["tile", "tile"], "tile", {}, lambda a, b: a * b), + OpSpec("block.div", ["tile", "tile"], "tile", {"avoid_zero": True}, lambda a, b: a / b), + OpSpec("block.maximum", ["tile", "tile"], "tile", {}, lambda a, b: np.maximum(a, b)), + ] + + # Block-level scalar operators + BLOCK_SCALAR_OPS = [ + OpSpec("block.adds", ["tile", "scalar"], "tile", {}, lambda a, s: a + s), + OpSpec("block.subs", ["tile", "scalar"], "tile", {}, lambda a, s: a - s), + OpSpec("block.muls", ["tile", "scalar"], "tile", {}, lambda a, s: a * s), + OpSpec("block.divs", ["tile", "scalar"], "tile", {"avoid_zero": True}, lambda a, s: a / s), + ] + + # Block-level unary operators + BLOCK_UNARY_OPS = [ + OpSpec("block.sqrt", ["tile"], "tile", {"positive_only": True}, lambda a: np.sqrt(a)), + OpSpec("block.rsqrt", ["tile"], "tile", {"positive_only": True}, lambda a: 1.0 / np.sqrt(a)), + OpSpec("block.exp", ["tile"], "tile", {}, lambda a: np.exp(np.clip(a, -10, 10))), + OpSpec("block.neg", ["tile"], "tile", {}, lambda a: -a), + OpSpec("block.recip", ["tile"], "tile", {"avoid_zero": True}, lambda a: 1.0 / a), + ] + + def __init__(self, seed: Optional[int] = None): + """Initialize fuzzer with optional seed for reproducibility.""" + self.rng = random.Random(seed) + # 使用所有操作符 + self.ops = self.BLOCK_BINARY_OPS + self.BLOCK_SCALAR_OPS + self.BLOCK_UNARY_OPS + + def generate_op_chain( + self, + num_ops: int = 5, + input_count: int = 2, + allow_scalars: bool = True, + track_shapes: bool = False, + default_shape: Tuple[int, int] = (128, 128), + ) -> List[Dict[str, Any]]: + """Generate a chain of operator calls. + + All input tensors and intermediate results are guaranteed to contribute + to the final output through smart generation and post-processing. 
+ """ + # Initialize available variables + available_tiles = [f"tile_{chr(97 + i)}" for i in range(input_count)] + available_scalars = ["1.0", "2.0", "0.5"] if allow_scalars else [] + + # Track which initial inputs have been used + initial_inputs = set(available_tiles) + used_inputs = set() + + # Track usage count for each variable + variable_usage_count = {tile: 0 for tile in available_tiles} + + # Shape tracking (optional) + variable_shapes = {} + if track_shapes: + for tile in available_tiles: + variable_shapes[tile] = default_shape + + operations = [] + + for i in range(num_ops): + # Calculate urgency for using unused inputs + unused_count = len(initial_inputs - used_inputs) + remaining_ops = num_ops - i + + # Dynamic priority + use_unused_priority = 0.7 + if unused_count > 0: + if unused_count >= remaining_ops: + use_unused_priority = 1.0 + elif remaining_ops > 0: + use_unused_priority = min(0.9, 0.7 + 0.3 * (unused_count / remaining_ops)) + + # Select eligible operators + eligible_ops = self._get_eligible_ops( + available_tiles, + available_scalars, + allow_scalars, + variable_shapes if track_shapes else None, + ) + + if not eligible_ops: + break + + # Prioritize binary ops if we need to use unused inputs + if unused_count > 0 and use_unused_priority >= 0.9: + binary_ops = [op for op in eligible_ops if sum(1 for t in op.input_types if t == "tile") >= 2] + if binary_ops: + eligible_ops = binary_ops + + op = self.rng.choice(eligible_ops) + + # Select inputs + inputs = [] + scalar_value = None + + for input_type in op.input_types: + if input_type == "tile": + candidate_tiles = available_tiles + + if track_shapes: + candidate_tiles = [ + t for t in candidate_tiles + if self._is_shape_compatible(op, t, variable_shapes) + ] + if not candidate_tiles: + continue + + # Smart selection: prioritize unused inputs + unused_initial_inputs = { + t for t in candidate_tiles + if t in initial_inputs and t not in used_inputs + } + + candidate_scores = [] + for t in 
candidate_tiles: + score = 0 + + if t in unused_initial_inputs: + score += 50 + if use_unused_priority >= 0.9: + score += 30 + + usage = variable_usage_count.get(t, 0) + score += max(0, 20 - usage * 5) + + if t.startswith("tmp_"): + score += 5 + + candidate_scores.append((t, score)) + + if candidate_scores: + max_score = max(score for _, score in candidate_scores) + + if max_score >= 40: + threshold = max(max_score * 0.6, 30) + top_candidates = [t for t, score in candidate_scores if score >= threshold] + + if top_candidates and self.rng.random() < 0.85: + candidate_tiles = top_candidates + else: + min_score_needed = max(max_score * 0.7, 10) + preferred = [t for t, score in candidate_scores if score >= min_score_needed] + if preferred and self.rng.random() < 0.75: + candidate_tiles = preferred + + selected_input = self.rng.choice(candidate_tiles) + inputs.append(selected_input) + + variable_usage_count[selected_input] = variable_usage_count.get(selected_input, 0) + 1 + + if selected_input in initial_inputs: + used_inputs.add(selected_input) + + elif input_type == "scalar": + if self.rng.random() < 0.5 and available_scalars: + scalar_value = self.rng.choice(available_scalars) + else: + scalar_value = f"{self.rng.uniform(0.1, 10.0):.2f}" + inputs.append(scalar_value) + + output = f"tmp_{i}" + + # Generate operator parameters if required + params = None + if op.requires_params: + input_shapes = [variable_shapes[inp] for inp in inputs if inp in variable_shapes] + if input_shapes: + params = op.generate_params(input_shapes, self.rng) + + op_dict = { + "op": op, + "inputs": inputs, + "output": output, + "scalar_value": scalar_value, + "params": params, + } + + # Compute output shape if tracking + if track_shapes: + input_shapes = [variable_shapes[inp] for inp in inputs if inp in variable_shapes] + output_shape = op.compute_output_shape(input_shapes, params) + op_dict["output_shape"] = output_shape + variable_shapes[output] = output_shape + + operations.append(op_dict) + 
available_tiles.append(output) + variable_usage_count[output] = 0 + + # Ensure all initial inputs are used + unused_inputs = initial_inputs - used_inputs + if unused_inputs: + add_op = next((op for op in self.BLOCK_BINARY_OPS if op.name == "block.add"), None) + + for unused_input in unused_inputs: + if operations: + current_final = operations[-1]["output"] + output = f"tmp_{len(operations)}" + + op_dict = { + "op": add_op, + "inputs": [unused_input, current_final], + "output": output, + "scalar_value": None, + "params": None, + } + + if track_shapes: + input_shapes = [ + variable_shapes.get(unused_input, default_shape), + variable_shapes.get(current_final, default_shape) + ] + output_shape = add_op.compute_output_shape(input_shapes) + op_dict["output_shape"] = output_shape + variable_shapes[output] = output_shape + + operations.append(op_dict) + available_tiles.append(output) + used_inputs.add(unused_input) + variable_usage_count[output] = 0 + variable_usage_count[unused_input] = variable_usage_count.get(unused_input, 0) + 1 + variable_usage_count[current_final] = variable_usage_count.get(current_final, 0) + 1 + + # Ensure all intermediate results contribute to the final output + if operations: + final_output = operations[-1]["output"] + unused_intermediates = [] + + for var_name, usage_count in variable_usage_count.items(): + if var_name.startswith("tmp_") and usage_count == 0 and var_name != final_output: + unused_intermediates.append(var_name) + + if unused_intermediates: + add_op = next((op for op in self.BLOCK_BINARY_OPS if op.name == "block.add"), None) + + for unused_var in unused_intermediates: + current_final = operations[-1]["output"] + output = f"tmp_{len(operations)}" + + op_dict = { + "op": add_op, + "inputs": [unused_var, current_final], + "output": output, + "scalar_value": None, + "params": None, + } + + if track_shapes: + input_shapes = [ + variable_shapes.get(unused_var, default_shape), + variable_shapes.get(current_final, default_shape) + ] + 
output_shape = add_op.compute_output_shape(input_shapes) + op_dict["output_shape"] = output_shape + variable_shapes[output] = output_shape + + operations.append(op_dict) + available_tiles.append(output) + variable_usage_count[output] = 0 + variable_usage_count[unused_var] = variable_usage_count.get(unused_var, 0) + 1 + variable_usage_count[current_final] = variable_usage_count.get(current_final, 0) + 1 + + return operations + + def _get_eligible_ops( + self, + available_tiles: List[str], + available_scalars: List[str], + allow_scalars: bool, + variable_shapes: Optional[Dict[str, Tuple[int, int]]] = None, + ) -> List[OpSpec]: + """Get operators that can be applied with current variables.""" + eligible = [] + + for op in self.ops: + tile_inputs = sum(1 for t in op.input_types if t == "tile") + scalar_inputs = sum(1 for t in op.input_types if t == "scalar") + + has_tiles = len(available_tiles) >= tile_inputs + has_scalars = (scalar_inputs == 0) or (allow_scalars and + (len(available_scalars) >= scalar_inputs or scalar_inputs > 0)) + + if has_tiles and has_scalars: + eligible.append(op) + + return eligible + + def _is_shape_compatible( + self, + op: OpSpec, + var: str, + variable_shapes: Dict[str, Tuple[int, int]] + ) -> bool: + """Check if a variable's shape is compatible with an operator.""" + if var not in variable_shapes: + return True + return True # All current ops are compatible with any shape + + def generate_numpy_reference( + self, + op_chain: List[Dict[str, Any]], + input_tensors: Dict[str, Any], + ) -> Any: + """Generate NumPy golden reference from operation chain.""" + import numpy as np + + # Create variable environment + env = {} + for name, tensor in input_tensors.items(): + env[f"tile_{name}"] = tensor.copy() + + # Execute operations + for op_dict in op_chain: + op = op_dict["op"] + inputs = op_dict["inputs"] + output = op_dict["output"] + params = op_dict.get("params") + + # Get input values + input_vals = [] + for inp in inputs: + if inp in env: + 
val = env[inp] + else: + val = float(inp) + input_vals.append(val) + + # Apply constraints + if "avoid_zero" in op.constraints and op.constraints["avoid_zero"]: + for i, val in enumerate(input_vals): + if isinstance(val, np.ndarray): + input_vals[i] = np.where(np.abs(val) < 0.01, 1.0, val) + + if "positive_only" in op.constraints and op.constraints["positive_only"]: + for i, val in enumerate(input_vals): + if isinstance(val, np.ndarray): + input_vals[i] = np.abs(val) + 1e-6 + + # Execute operation + if op.np_equivalent: + import inspect + sig = inspect.signature(op.np_equivalent) + if params and len(sig.parameters) > len(input_vals): + result = op.np_equivalent(*input_vals, params) + else: + result = op.np_equivalent(*input_vals) + env[output] = result + + # Return final result + if op_chain: + return env[op_chain[-1]["output"]] + else: + return input_tensors[list(input_tensors.keys())[0]] diff --git a/src/fuzzer/src/kernel_generator.py b/src/fuzzer/src/kernel_generator.py new file mode 100644 index 0000000..9a04870 --- /dev/null +++ b/src/fuzzer/src/kernel_generator.py @@ -0,0 +1,206 @@ +""" +InCore 内核函数生成器 + +该模块负责生成 @pl.function(type=pl.FunctionType.InCore) 内核函数。 +每个内核包含一系列随机生成的算子操作链。 +""" + +import random +from typing import Any, Dict, List, Optional, Tuple + +from .fuzzer import OpFuzzer + + +class KernelGenerator: + """生成 InCore 内核函数的生成器""" + + def __init__(self, seed: Optional[int] = None): + """初始化内核生成器 + + Args: + seed: 随机种子,用于可重现性 + """ + self.rng = random.Random(seed) + self.fuzzer = OpFuzzer(seed=seed) + + def generate_kernel( + self, + kernel_name: str, + num_inputs: int = 2, + num_ops: int = 5, + shape: Tuple[int, int] = (128, 128), + allow_scalars: bool = True, + input_shapes: Optional[List[Tuple[int, int]]] = None, + output_shape: Optional[Tuple[int, int]] = None, + ) -> Dict[str, Any]: + """生成单个 InCore 内核 + + Args: + kernel_name: 内核函数名称 + num_inputs: 输入张量数量(如果未指定 input_shapes) + num_ops: 操作数量 + shape: 默认张量形状(如果未指定 input_shapes) + allow_scalars: 
是否允许标量操作 + input_shapes: 每个输入的形状列表,如果指定则覆盖 num_inputs 和 shape + output_shape: 输出形状,如果指定则覆盖默认行为 + + Returns: + 包含内核信息的字典: + - name: 内核名称 + - inputs: 输入参数列表 [(name, shape), ...] + - output_shape: 输出形状 + - op_chain: 操作链 + - code: 生成的 PyPTO 代码 + """ + # 确定输入形状 + if input_shapes is not None: + actual_num_inputs = len(input_shapes) + actual_shapes = input_shapes + else: + actual_num_inputs = num_inputs + actual_shapes = [shape] * num_inputs + + # 生成操作链 + op_chain = self.fuzzer.generate_op_chain( + num_ops=num_ops, + input_count=actual_num_inputs, + allow_scalars=allow_scalars, + track_shapes=False, + default_shape=shape, + ) + + # 生成输入参数 + input_names = [chr(97 + i) for i in range(actual_num_inputs)] # a, b, c, ... + inputs = [(name, actual_shapes[i]) for i, name in enumerate(input_names)] + + # 确定输出形状:优先使用指定的 output_shape,否则使用第一个输入的形状 + if output_shape is not None: + actual_output_shape = output_shape + else: + actual_output_shape = actual_shapes[0] + + # 生成内核代码 + code = self._generate_kernel_code( + kernel_name=kernel_name, + inputs=inputs, + op_chain=op_chain, + output_shape=actual_output_shape, + ) + + return { + "name": kernel_name, + "inputs": inputs, + "output_shape": actual_output_shape, + "op_chain": op_chain, + "code": code, + } + + def _generate_kernel_code( + self, + kernel_name: str, + inputs: List[Tuple[str, Tuple[int, int]]], + op_chain: List[Dict[str, Any]], + output_shape: Tuple[int, int], + ) -> str: + """生成内核函数代码 + + Args: + kernel_name: 内核名称 + inputs: 输入参数列表 + op_chain: 操作链 + output_shape: 输出形状 + + Returns: + 生成的 PyPTO 代码字符串 + """ + rows, cols = output_shape + + # 生成函数签名 + params = [] + for name, (r, c) in inputs: + params.append(f"{name}: pl.Tensor[[{r}, {c}], pl.FP32]") + + code_lines = [ + f" @pl.function(type=pl.FunctionType.InCore)", + f" def {kernel_name}(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:", + ] + + # 加载输入张量 - 使用输出形状作为加载大小 + for name, (r, c) in inputs: + code_lines.append(f" tile_{name} = pl.op.block.load({name}, 
0, 0, {rows}, {cols})") + + # 生成操作链 + for op_dict in op_chain: + op = op_dict["op"] + inputs_str = ", ".join(op_dict["inputs"]) + output = op_dict["output"] + params = op_dict.get("params") + + if params: + params_str = ", ".join(f"{k}={v}" for k, v in params.items()) + code_lines.append(f" {output} = pl.op.{op.name}({inputs_str}, {params_str})") + else: + code_lines.append(f" {output} = pl.op.{op.name}({inputs_str})") + + # 返回最终结果 + if op_chain: + last_output = op_chain[-1]["output"] + code_lines.append(f" return {last_output}") + else: + # 如果没有操作,返回第一个输入 + first_input = inputs[0][0] + code_lines.append(f" return tile_{first_input}") + + return "\n".join(code_lines) + + def generate_multiple_kernels( + self, + num_kernels: int = 3, + num_inputs_range: Tuple[int, int] = (2, 3), + num_ops_range: Tuple[int, int] = (3, 7), + shape: Tuple[int, int] = (128, 128), + input_shapes_list: Optional[List[List[Tuple[int, int]]]] = None, + output_shapes: Optional[List[Tuple[int, int]]] = None, + ) -> List[Dict[str, Any]]: + """生成多个 InCore 内核 + + Args: + num_kernels: 要生成的内核数量 + num_inputs_range: 输入数量范围 (min, max) + num_ops_range: 操作数量范围 (min, max) + shape: 默认张量形状 + input_shapes_list: 每个内核的输入形状列表,如果指定则覆盖其他参数 + 例如: [[(128,128), (64,64)], [(256,256)], ...] 
+ output_shapes: 每个内核的输出形状列表(可选) + + Returns: + 内核信息字典列表 + """ + kernels = [] + for i in range(num_kernels): + num_ops = self.rng.randint(*num_ops_range) + + # 确定输入形状 + if input_shapes_list and i < len(input_shapes_list): + kernel_input_shapes = input_shapes_list[i] + kernel_output_shape = output_shapes[i] if output_shapes and i < len(output_shapes) else None + kernel = self.generate_kernel( + kernel_name=f"kernel_{i}", + num_ops=num_ops, + shape=shape, + input_shapes=kernel_input_shapes, + output_shape=kernel_output_shape, + ) + else: + num_inputs = self.rng.randint(*num_inputs_range) + kernel_output_shape = output_shapes[i] if output_shapes and i < len(output_shapes) else None + kernel = self.generate_kernel( + kernel_name=f"kernel_{i}", + num_inputs=num_inputs, + num_ops=num_ops, + shape=shape, + output_shape=kernel_output_shape, + ) + kernels.append(kernel) + + return kernels diff --git a/src/fuzzer/src/multi_kernel_test_generator.py b/src/fuzzer/src/multi_kernel_test_generator.py new file mode 100644 index 0000000..1c142bf --- /dev/null +++ b/src/fuzzer/src/multi_kernel_test_generator.py @@ -0,0 +1,651 @@ +""" +多内核测试用例生成器 + +该模块负责生成完整的测试用例,包括: +- 多个 InCore 内核 +- Orchestration 组合函数 +- NumPy 参考实现 +- PTOTestCase 测试类 +""" + +import sys +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from .fuzzer import OpFuzzer + +from .kernel_generator import KernelGenerator +from .orchestrator_generator import OrchestratorGenerator + + +class MultiKernelTestGenerator: + """生成多内核测试用例的生成器""" + + def __init__(self, seed: Optional[int] = None): + """初始化测试生成器 + + Args: + seed: 随机种子,用于可重现性 + """ + self.seed = seed + self.kernel_gen = KernelGenerator(seed=seed) + self.orch_gen = OrchestratorGenerator(seed=seed) + self.fuzzer = OpFuzzer(seed=seed) + + def _compute_output_shapes_for_sequential( + self, + num_kernels: int, + default_shape: Tuple[int, int], + input_shapes_list: Optional[List[List[Tuple[int, int]]]], + mode: str, + ) -> List[Tuple[int, 
int]]: + """计算顺序模式下每个内核的输出形状,确保形状兼容性 + + Args: + num_kernels: 内核数量 + default_shape: 默认形状 + input_shapes_list: 输入形状列表 + mode: 组合模式 + + Returns: + 每个内核的输出形状列表 + """ + output_shapes = [] + + if mode == "sequential": + # 顺序模式:kernel_i 的输出必须匹配 kernel_{i+1} 的第一个输入 + for i in range(num_kernels): + if i == num_kernels - 1: + # 最后一个内核:输出形状使用其第一个输入的形状 + if input_shapes_list and i < len(input_shapes_list): + output_shapes.append(input_shapes_list[i][0]) + else: + output_shapes.append(default_shape) + else: + # 非最后一个内核:输出形状必须匹配下一个内核的第一个输入 + if input_shapes_list and i + 1 < len(input_shapes_list): + next_kernel_first_input = input_shapes_list[i + 1][0] + output_shapes.append(next_kernel_first_input) + else: + output_shapes.append(default_shape) + + elif mode == "branching": + # 分支模式:所有内核必须有相同的输出形状(用于合并) + # 使用第一个内核的第一个输入形状作为统一输出形状 + if input_shapes_list and len(input_shapes_list) > 0: + unified_output_shape = input_shapes_list[0][0] + else: + unified_output_shape = default_shape + + for i in range(num_kernels): + output_shapes.append(unified_output_shape) + + elif mode == "mixed": + # 混合模式:前半部分并行,后半部分顺序 + mid = num_kernels // 2 + + # 并行部分:所有内核使用相同的输出形状 + if input_shapes_list and len(input_shapes_list) > 0: + parallel_output_shape = input_shapes_list[0][0] + else: + parallel_output_shape = default_shape + + for i in range(num_kernels): + if i < mid: + # 并行部分:统一输出形状 + output_shapes.append(parallel_output_shape) + elif i == mid: + # 第一个顺序内核:输出形状匹配下一个内核的第一个输入(如果有) + if i == num_kernels - 1: + # 如果是最后一个,使用其第一个输入的形状 + if input_shapes_list and i < len(input_shapes_list): + output_shapes.append(input_shapes_list[i][0]) + else: + output_shapes.append(default_shape) + else: + # 匹配下一个内核的第一个输入 + if input_shapes_list and i + 1 < len(input_shapes_list): + output_shapes.append(input_shapes_list[i + 1][0]) + else: + output_shapes.append(default_shape) + else: + # 后续顺序内核 + if i == num_kernels - 1: + # 最后一个内核 + if input_shapes_list and i < len(input_shapes_list): + 
    def _regenerate_kernel_code_with_unified_shapes(
        self,
        kernel: Dict[str, Any],
        input_shapes_map: Dict[str, Tuple[int, int]],
    ) -> str:
        """Regenerate a kernel's source using the unified input shapes.

        Args:
            kernel: Kernel-info dict (``name``, ``inputs``, ``output_shape``,
                ``op_chain``) as produced by KernelGenerator.
            input_shapes_map: Unified mapping of input name -> shape shared by
                all kernels in the test case.

        Returns:
            The regenerated kernel code as a string.
        """
        kernel_name = kernel["name"]
        output_shape = kernel["output_shape"]
        op_chain = kernel["op_chain"]
        rows, cols = output_shape

        # Build the signature from the UNIFIED shapes, not the shapes the
        # kernel was originally generated with.
        params = []
        for inp_name, _ in kernel["inputs"]:
            unified_shape = input_shapes_map[inp_name]
            params.append(f"{inp_name}: pl.Tensor[[{unified_shape[0]}, {unified_shape[1]}], pl.FP32]")

        code_lines = [
            f"    @pl.function(type=pl.FunctionType.InCore)",
            f"    def {kernel_name}(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:",
        ]

        # Load input tensors -- the load extent is the OUTPUT shape.
        # NOTE(review): assumes every unified input is at least output-sized;
        # confirm against how input_shapes_map is built by the caller.
        for inp_name, _ in kernel["inputs"]:
            code_lines.append(f"        tile_{inp_name} = pl.op.block.load({inp_name}, 0, 0, {rows}, {cols})")

        # Emit the operation chain, one statement per step.
        for op_dict in op_chain:
            op = op_dict["op"]
            inputs_str = ", ".join(op_dict["inputs"])
            output = op_dict["output"]
            params_dict = op_dict.get("params")

            if params_dict:
                params_str = ", ".join(f"{k}={v}" for k, v in params_dict.items())
                code_lines.append(f"        {output} = pl.op.{op.name}({inputs_str}, {params_str})")
            else:
                code_lines.append(f"        {output} = pl.op.{op.name}({inputs_str})")

        # Return the final chain result.
        if op_chain:
            last_output = op_chain[-1]["output"]
            code_lines.append(f"        return {last_output}")
        else:
            # Empty chain: return the first loaded input unchanged.
            first_input = kernel["inputs"][0][0]
            code_lines.append(f"        return tile_{first_input}")

        return "\n".join(code_lines)
orchestration_mode: str = "sequential", + shape: Tuple[int, int] = (128, 128), + num_ops_range: Tuple[int, int] = (3, 7), + input_shapes_list: Optional[List[List[Tuple[int, int]]]] = None, + ) -> str: + """生成完整的测试用例代码 + + Args: + test_name: 测试用例名称 + num_kernels: 内核数量 + orchestration_mode: 组合模式 ("sequential", "branching", "mixed") + shape: 张量形状 + num_ops_range: 每个内核的操作数量范围 + input_shapes_list: 每个内核的输入形状列表(可选) + + Returns: + 完整的测试用例代码字符串 + """ + # 对于 sequential、branching 和 mixed 模式,计算输出形状以确保兼容性 + if orchestration_mode in ["sequential", "branching", "mixed"]: + output_shapes = self._compute_output_shapes_for_sequential( + num_kernels, shape, input_shapes_list, orchestration_mode + ) + else: + output_shapes = None + + # 生成多个内核 + kernels = self.kernel_gen.generate_multiple_kernels( + num_kernels=num_kernels, + num_inputs_range=(2, 3), + num_ops_range=num_ops_range, + shape=shape, + input_shapes_list=input_shapes_list, + output_shapes=output_shapes, + ) + + # 生成 Orchestration 函数 + if orchestration_mode == "sequential": + orch_info = self.orch_gen.generate_sequential(kernels, shape) + elif orchestration_mode == "branching": + orch_info = self.orch_gen.generate_branching(kernels, shape) + elif orchestration_mode == "mixed": + orch_info = self.orch_gen.generate_mixed(kernels, shape) + else: + raise ValueError(f"未知的组合模式: {orchestration_mode}") + + # 生成 NumPy 参考实现 + numpy_code = self._generate_numpy_reference(kernels, orch_info) + + # 生成完整的测试类 + test_code = self._generate_test_class( + test_name=test_name, + kernels=kernels, + orch_info=orch_info, + numpy_code=numpy_code, + shape=shape, + ) + + return test_code + + def _generate_numpy_reference( + self, + kernels: List[Dict[str, Any]], + orch_info: Dict[str, Any], + ) -> str: + """生成 NumPy 参考实现代码 + + Args: + kernels: 内核信息列表 + orch_info: Orchestration 信息 + + Returns: + NumPy 参考实现代码字符串 + """ + code_lines = [] + + # 为每个内核生成 NumPy 函数 + for kernel in kernels: + kernel_name = kernel["name"] + input_names = [inp[0] for inp in 
kernel["inputs"]] + op_chain = kernel["op_chain"] + + code_lines.append(f" def _numpy_{kernel_name}(self, {', '.join(input_names)}):") + code_lines.append(f" \"\"\"NumPy 实现: {kernel_name}\"\"\"") + + # 生成 NumPy 操作 + code_lines.append(f" # 创建变量环境") + code_lines.append(f" env = {{}}") + for name in input_names: + code_lines.append(f" env['tile_{name}'] = {name}.copy()") + + code_lines.append(f"") + code_lines.append(f" # 执行操作链") + for op_dict in op_chain: + op = op_dict["op"] + inputs = op_dict["inputs"] + output = op_dict["output"] + + # 获取输入值 + input_vals = [] + for inp in inputs: + if inp.startswith("tile_") or inp.startswith("tmp_"): + input_vals.append(f"env['{inp}']") + else: + input_vals.append(inp) + + # 应用约束 + if "avoid_zero" in op.constraints and op.constraints["avoid_zero"]: + for i, inp in enumerate(inputs): + if inp.startswith("tile_") or inp.startswith("tmp_"): + code_lines.append(f" env['{inp}'] = np.where(np.abs(env['{inp}']) < 0.01, 1.0, env['{inp}'])") + + if "positive_only" in op.constraints and op.constraints["positive_only"]: + for i, inp in enumerate(inputs): + if inp.startswith("tile_") or inp.startswith("tmp_"): + code_lines.append(f" env['{inp}'] = np.abs(env['{inp}']) + 1e-6") + + # 生成操作 + if op.np_equivalent: + np_expr = self._get_numpy_operation(op.name, input_vals) + code_lines.append(f" env['{output}'] = {np_expr}") + + code_lines.append(f" return env['{op_chain[-1]['output']}']") + code_lines.append(f"") + + return "\n".join(code_lines) + + def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: + """将 PyPTO 操作名转换为 NumPy 操作表达式 + + Args: + op_name: PyPTO 操作名 (如 "block.add") + input_vals: 输入值列表 + + Returns: + NumPy 操作表达式字符串 + """ + # 根据操作类型生成表达式 + if op_name == "block.add": + return f"{input_vals[0]} + {input_vals[1]}" + elif op_name == "block.sub": + return f"{input_vals[0]} - {input_vals[1]}" + elif op_name == "block.mul": + return f"{input_vals[0]} * {input_vals[1]}" + elif op_name == "block.div": + return 
f"{input_vals[0]} / {input_vals[1]}" + elif op_name == "block.maximum": + return f"np.maximum({input_vals[0]}, {input_vals[1]})" + elif op_name == "block.adds": + return f"{input_vals[0]} + {input_vals[1]}" + elif op_name == "block.subs": + return f"{input_vals[0]} - {input_vals[1]}" + elif op_name == "block.muls": + return f"{input_vals[0]} * {input_vals[1]}" + elif op_name == "block.divs": + return f"{input_vals[0]} / {input_vals[1]}" + elif op_name == "block.sqrt": + return f"np.sqrt({input_vals[0]})" + elif op_name == "block.rsqrt": + return f"1.0 / np.sqrt({input_vals[0]})" + elif op_name == "block.exp": + return f"np.exp(np.clip({input_vals[0]}, -10, 10))" + elif op_name == "block.neg": + return f"-{input_vals[0]}" + elif op_name == "block.recip": + return f"1.0 / {input_vals[0]}" + else: + return f"# 未知操作: {op_name}" + + def _generate_test_class( + self, + test_name: str, + kernels: List[Dict[str, Any]], + orch_info: Dict[str, Any], + numpy_code: str, + shape: Tuple[int, int], + ) -> str: + """生成完整的测试类代码 + + Args: + test_name: 测试名称 + kernels: 内核信息列表 + orch_info: Orchestration 信息 + numpy_code: NumPy 参考实现代码 + shape: 张量形状 + + Returns: + 完整的测试类代码 + """ + rows, cols = shape + class_name = f"Test{test_name.replace('_', ' ').title().replace(' ', '')}" + + # 收集所有输入及其实际形状 + input_shapes_map = {} # {input_name: shape} + for kernel in kernels: + for inp_name, inp_shape in kernel["inputs"]: + if inp_name not in input_shapes_map: + input_shapes_map[inp_name] = inp_shape + # 如果同一个输入在不同内核中有不同形状,使用较大的形状 + elif inp_shape != input_shapes_map[inp_name]: + existing_size = input_shapes_map[inp_name][0] * input_shapes_map[inp_name][1] + new_size = inp_shape[0] * inp_shape[1] + if new_size > existing_size: + input_shapes_map[inp_name] = inp_shape + + input_list = sorted(input_shapes_map.keys()) + + # 输出形状使用最后一个内核的输出形状 + output_shape = kernels[-1]["output_shape"] if kernels else shape + + # 生成头部 + code_lines = [ + f"class {class_name}(PTOTestCase):", + f" \"\"\"", + f" 测试用例: 
{test_name}", + f" 组合模式: {orch_info['mode']}", + f" 内核数量: {len(kernels)}", + f" \"\"\"", + f"", + f" def __init__(self, **kwargs):", + f" super().__init__(**kwargs)", + f" self.rows = {rows}", + f" self.cols = {cols}", + f"", + f" def get_name(self) -> str:", + f" return '{test_name}'", + f"", + f" def define_tensors(self) -> List[TensorSpec]:", + f" return [", + ] + + # 定义输入张量 - 使用实际形状 + for inp_name in input_list: + init_val = 2.0 + input_list.index(inp_name) * 0.5 + inp_shape = input_shapes_map[inp_name] + code_lines.append(f" TensorSpec('{inp_name}', [{inp_shape[0]}, {inp_shape[1]}], DataType.FP32, init_value={init_val}),") + + # 定义输出张量 - 使用实际输出形状 + code_lines.append(f" TensorSpec('output', [{output_shape[0]}, {output_shape[1]}], DataType.FP32, is_output=True),") + code_lines.append(f" ]") + code_lines.append(f"") + + # 生成 PyPTO 程序 + code_lines.append(f" def get_program(self) -> Any:") + code_lines.append(f" import pypto.language as pl") + code_lines.append(f"") + code_lines.append(f" @pl.program") + code_lines.append(f" class {test_name.replace('_', ' ').title().replace(' ', '')}Program:") + + # 添加所有内核(需要额外缩进) + for kernel in kernels: + # 使用统一的输入形状重新生成 kernel 代码 + regenerated_code = self._regenerate_kernel_code_with_unified_shapes(kernel, input_shapes_map) + # 为内核代码添加额外的8个空格缩进(4个用于get_program方法,4个用于@pl.program类) + kernel_lines = regenerated_code.split("\n") + for line in kernel_lines: + code_lines.append(f" {line}") + code_lines.append(f"") + + # 添加合并内核(如果需要) + if orch_info.get("needs_merge_kernel", False): + merge_code = self.orch_gen.generate_merge_kernel(shape) + merge_lines = merge_code.split("\n") + for line in merge_lines: + code_lines.append(f" {line}") + code_lines.append(f"") + + # 添加 Orchestration 函数 + orch_lines = orch_info["code"].split("\n") + for line in orch_lines: + code_lines.append(f" {line}") + code_lines.append(f"") + + code_lines.append(f" return {test_name.replace('_', ' ').title().replace(' ', '')}Program") + code_lines.append(f"") + + # 
添加 NumPy 参考实现 + code_lines.append(f" def compute_expected(self, tensors, params=None):") + code_lines.append(f" \"\"\"使用 NumPy 计算期望输出\"\"\"") + code_lines.append(numpy_code) + code_lines.append(f"") + + # 根据组合模式生成计算逻辑 + if orch_info["mode"] == "sequential": + code_lines.append(f" # 顺序执行模式") + result_var = None + for i, kernel in enumerate(kernels): + kernel_name = kernel["name"] + kernel_inputs = [inp[0] for inp in kernel["inputs"]] + + if i > 0 and result_var: + # 第一个输入使用前一个结果(变量名) + kernel_inputs[0] = result_var + # 构建参数列表:第一个是变量,其他从 tensors 获取 + inputs_parts = [kernel_inputs[0]] + for inp in kernel_inputs[1:]: + inputs_parts.append(f"tensors['{inp}']") + inputs_str = ", ".join(inputs_parts) + else: + # 第一个内核,所有输入都从 tensors 获取 + inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) + + result_var = f"result_{i}" + code_lines.append(f" {result_var} = self._numpy_{kernel_name}({inputs_str})") + + code_lines.append(f" tensors['output'][:] = {result_var}") + + elif orch_info["mode"] == "branching": + code_lines.append(f" # 分支执行模式") + branch_results = [] + for i, kernel in enumerate(kernels): + kernel_name = kernel["name"] + kernel_inputs = [inp[0] for inp in kernel["inputs"]] + result_var = f"branch_{i}" + branch_results.append(result_var) + + inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) + code_lines.append(f" {result_var} = self._numpy_{kernel_name}({inputs_str})") + + # 合并结果 + if len(branch_results) == 1: + code_lines.append(f" tensors['output'][:] = {branch_results[0]}") + else: + merged = branch_results[0] + for i in range(1, len(branch_results)): + new_merged = f"merged_{i}" + code_lines.append(f" {new_merged} = {merged} + {branch_results[i]}") + merged = new_merged + code_lines.append(f" tensors['output'][:] = {merged}") + + elif orch_info["mode"] == "mixed": + code_lines.append(f" # 混合执行模式") + mid = len(kernels) // 2 + parallel_kernels = kernels[:mid] + sequential_kernels = kernels[mid:] + + # 并行部分 + branch_results = 
[] + for i, kernel in enumerate(parallel_kernels): + kernel_name = kernel["name"] + kernel_inputs = [inp[0] for inp in kernel["inputs"]] + result_var = f"parallel_{i}" + branch_results.append(result_var) + + inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) + code_lines.append(f" {result_var} = self._numpy_{kernel_name}({inputs_str})") + + # 合并并行结果 + if len(branch_results) > 1: + merged = branch_results[0] + for i in range(1, len(branch_results)): + new_merged = f"merged_parallel_{i}" + code_lines.append(f" {new_merged} = {merged} + {branch_results[i]}") + merged = new_merged + current_result = merged + else: + current_result = branch_results[0] + + # 顺序部分 + for i, kernel in enumerate(sequential_kernels): + kernel_name = kernel["name"] + kernel_inputs = [inp[0] for inp in kernel["inputs"]] + kernel_inputs[0] = current_result + + result_var = f"sequential_{i}" + # 第一个输入是变量,其他是张量 + inputs_parts = [kernel_inputs[0]] + for inp in kernel_inputs[1:]: + inputs_parts.append(f"tensors['{inp}']") + inputs_str = ", ".join(inputs_parts) + code_lines.append(f" {result_var} = self._numpy_{kernel_name}({inputs_str})") + current_result = result_var + + code_lines.append(f" tensors['output'][:] = {current_result}") + + code_lines.append(f"") + + return "\n".join(code_lines) + + def generate_test_file( + self, + output_path: str, + test_configs: List[Dict[str, Any]], + ) -> None: + """生成完整的测试文件 + + Args: + output_path: 输出文件路径 + test_configs: 测试配置列表,每个配置包含: + - name: 测试名称 + - num_kernels: 内核数量 + - mode: 组合模式 + - shape: 张量形状 + - num_ops_range: 操作数量范围 + """ + # 生成文件头 + header = '''""" +自动生成的多内核模糊测试用例 + +该文件由 MultiKernelTestGenerator 自动生成。 +包含多个测试用例,每个测试用例包含多个 InCore 内核和一个 Orchestration 函数。 +""" + +import sys +from pathlib import Path +from typing import Any, List + +import numpy as np +import pytest + +from pto_test.core.test_case import DataType, PTOTestCase, TensorSpec + +# 添加 pypto 到路径 +_FRAMEWORK_ROOT = Path(__file__).parent.parent.parent.parent +_PYPTO_ROOT = 
_FRAMEWORK_ROOT / "3rdparty" / "pypto" / "python" +if _PYPTO_ROOT.exists() and str(_PYPTO_ROOT) not in sys.path: + sys.path.insert(0, str(_PYPTO_ROOT)) + + +''' + + # 生成所有测试用例 + test_cases = [] + for config in test_configs: + test_code = self.generate_test_case( + test_name=config["name"], + num_kernels=config.get("num_kernels", 3), + orchestration_mode=config.get("mode", "sequential"), + shape=config.get("shape", (128, 128)), + num_ops_range=config.get("num_ops_range", (3, 7)), + input_shapes_list=config.get("input_shapes_list"), + ) + test_cases.append(test_code) + + # 生成测试套件 + test_suite = ''' + +class TestMultiKernelFuzzing: + """多内核模糊测试套件""" + +''' + + for config in test_configs: + test_name = config["name"] + class_name = f"Test{test_name.replace('_', ' ').title().replace(' ', '')}" + test_suite += f''' def test_{test_name}(self, test_runner): + """测试 {test_name}""" + test_case = {class_name}() + result = test_runner.run(test_case) + assert result.passed, f"测试失败: {{result.error}}" + +''' + + # 组合完整文件 + full_content = header + "\n\n".join(test_cases) + test_suite + + # 写入文件 + output_file = Path(output_path) + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.write_text(full_content, encoding="utf-8") + + print(f"测试文件已生成: {output_path}") diff --git a/src/fuzzer/src/orchestrator_generator.py b/src/fuzzer/src/orchestrator_generator.py new file mode 100644 index 0000000..8d09fd0 --- /dev/null +++ b/src/fuzzer/src/orchestrator_generator.py @@ -0,0 +1,298 @@ +""" +Orchestration 组合函数生成器 + +该模块负责生成 @pl.function(type=pl.FunctionType.Orchestration) 函数, +用于组合多个 InCore 内核。支持三种组合模式: +- Sequential: 顺序执行内核 +- Branching: 分支执行内核 +- Mixed: 混合模式 +""" + +import random +from typing import Any, Dict, List, Optional, Tuple + + +class OrchestratorGenerator: + """生成 Orchestration 组合函数的生成器""" + + def __init__(self, seed: Optional[int] = None): + """初始化组合函数生成器 + + Args: + seed: 随机种子,用于可重现性 + """ + self.rng = random.Random(seed) + + def generate_sequential( + self, + 
kernels: List[Dict[str, Any]], + shape: Tuple[int, int] = (128, 128), + ) -> Dict[str, Any]: + """生成顺序执行模式的 Orchestration 函数 + + 在顺序模式中,每个内核的输出作为下一个内核的输入。 + + Args: + kernels: 内核信息列表 + shape: 张量形状 + + Returns: + 包含组合函数信息的字典 + """ + if not kernels: + raise ValueError("至少需要一个内核") + + # 收集所有需要的输入及其形状 + input_shapes_map = {} # {input_name: shape} + for kernel in kernels: + for inp_name, inp_shape in kernel["inputs"]: + if inp_name not in input_shapes_map: + input_shapes_map[inp_name] = inp_shape + # 如果同一个输入在不同内核中有不同形状,使用较大的形状 + elif inp_shape != input_shapes_map[inp_name]: + existing_size = input_shapes_map[inp_name][0] * input_shapes_map[inp_name][1] + new_size = inp_shape[0] * inp_shape[1] + if new_size > existing_size: + input_shapes_map[inp_name] = inp_shape + + # 生成函数签名 + input_params = sorted(input_shapes_map.keys()) + params = [] + for name in input_params: + inp_shape = input_shapes_map[name] + params.append(f"{name}: pl.Tensor[[{inp_shape[0]}, {inp_shape[1]}], pl.FP32]") + + # 输出形状使用最后一个内核的输出形状 + output_shape = kernels[-1]["output_shape"] + rows, cols = output_shape + + code_lines = [ + " @pl.function(type=pl.FunctionType.Orchestration)", + f" def orchestrator(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:", + ] + + # 顺序调用内核 + result_var = None + for i, kernel in enumerate(kernels): + kernel_name = kernel["name"] + kernel_inputs = [inp[0] for inp in kernel["inputs"]] + + # 如果不是第一个内核,使用前一个内核的输出 + if i > 0 and result_var: + # 替换第一个输入为前一个内核的输出 + kernel_inputs[0] = result_var + + result_var = f"result_{i}" + inputs_str = ", ".join(kernel_inputs) + code_lines.append(f" {result_var} = self.{kernel_name}({inputs_str})") + + # 返回最后一个结果 + code_lines.append(f" return {result_var}") + + return { + "mode": "sequential", + "code": "\n".join(code_lines), + "inputs": input_params, + "output_shape": output_shape, + } + + def generate_branching( + self, + kernels: List[Dict[str, Any]], + shape: Tuple[int, int] = (128, 128), + ) -> Dict[str, Any]: + 
"""生成分支执行模式的 Orchestration 函数 + + 在分支模式中,多个内核并行执行,然后合并结果。 + + Args: + kernels: 内核信息列表 + shape: 张量形状 + + Returns: + 包含组合函数信息的字典 + """ + if not kernels: + raise ValueError("至少需要一个内核") + + # 收集所有需要的输入及其形状 + input_shapes_map = {} # {input_name: shape} + for kernel in kernels: + for inp_name, inp_shape in kernel["inputs"]: + if inp_name not in input_shapes_map: + input_shapes_map[inp_name] = inp_shape + # 如果同一个输入在不同内核中有不同形状,使用较大的形状 + elif inp_shape != input_shapes_map[inp_name]: + existing_size = input_shapes_map[inp_name][0] * input_shapes_map[inp_name][1] + new_size = inp_shape[0] * inp_shape[1] + if new_size > existing_size: + input_shapes_map[inp_name] = inp_shape + + # 生成函数签名 + input_params = sorted(input_shapes_map.keys()) + params = [] + for name in input_params: + inp_shape = input_shapes_map[name] + params.append(f"{name}: pl.Tensor[[{inp_shape[0]}, {inp_shape[1]}], pl.FP32]") + + # 输出形状:在分支模式中,所有分支应该有相同的输出形状 + # 使用第一个内核的输出形状 + output_shape = kernels[0]["output_shape"] + rows, cols = output_shape + + code_lines = [ + " @pl.function(type=pl.FunctionType.Orchestration)", + f" def orchestrator(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:", + ] + + # 并行执行所有内核 + result_vars = [] + for i, kernel in enumerate(kernels): + kernel_name = kernel["name"] + kernel_inputs = [inp[0] for inp in kernel["inputs"]] + result_var = f"branch_{i}" + result_vars.append(result_var) + + inputs_str = ", ".join(kernel_inputs) + code_lines.append(f" {result_var} = self.{kernel_name}({inputs_str})") + + # 合并所有分支结果 + if len(result_vars) == 1: + code_lines.append(f" return {result_vars[0]}") + else: + # 使用 add 操作合并结果 + code_lines.append(f" # 合并所有分支结果") + merged = result_vars[0] + for i in range(1, len(result_vars)): + new_merged = f"merged_{i}" + code_lines.append(f" {new_merged} = self.merge_results({merged}, {result_vars[i]})") + merged = new_merged + code_lines.append(f" return {merged}") + + return { + "mode": "branching", + "code": "\n".join(code_lines), + 
"inputs": input_params, + "output_shape": output_shape, + "needs_merge_kernel": len(result_vars) > 1, + } + + def generate_mixed( + self, + kernels: List[Dict[str, Any]], + shape: Tuple[int, int] = (128, 128), + ) -> Dict[str, Any]: + """生成混合模式的 Orchestration 函数 + + 混合模式结合了顺序和分支执行。 + + Args: + kernels: 内核信息列表 + shape: 张量形状 + + Returns: + 包含组合函数信息的字典 + """ + if len(kernels) < 2: + # 如果内核数量少于2,使用顺序模式 + return self.generate_sequential(kernels, shape) + + # 收集所有需要的输入及其形状 + input_shapes_map = {} # {input_name: shape} + for kernel in kernels: + for inp_name, inp_shape in kernel["inputs"]: + if inp_name not in input_shapes_map: + input_shapes_map[inp_name] = inp_shape + # 如果同一个输入在不同内核中有不同形状,使用较大的形状 + elif inp_shape != input_shapes_map[inp_name]: + existing_size = input_shapes_map[inp_name][0] * input_shapes_map[inp_name][1] + new_size = inp_shape[0] * inp_shape[1] + if new_size > existing_size: + input_shapes_map[inp_name] = inp_shape + + # 生成函数签名 + input_params = sorted(input_shapes_map.keys()) + params = [] + for name in input_params: + inp_shape = input_shapes_map[name] + params.append(f"{name}: pl.Tensor[[{inp_shape[0]}, {inp_shape[1]}], pl.FP32]") + + # 输出形状使用最后一个内核的输出形状 + output_shape = kernels[-1]["output_shape"] + rows, cols = output_shape + + code_lines = [ + " @pl.function(type=pl.FunctionType.Orchestration)", + f" def orchestrator(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:", + ] + + # 将内核分成两组:前半部分并行,后半部分顺序 + mid = len(kernels) // 2 + parallel_kernels = kernels[:mid] + sequential_kernels = kernels[mid:] + + # 并行执行前半部分 + branch_results = [] + for i, kernel in enumerate(parallel_kernels): + kernel_name = kernel["name"] + kernel_inputs = [inp[0] for inp in kernel["inputs"]] + result_var = f"parallel_{i}" + branch_results.append(result_var) + + inputs_str = ", ".join(kernel_inputs) + code_lines.append(f" {result_var} = self.{kernel_name}({inputs_str})") + + # 合并并行结果 + if len(branch_results) > 1: + code_lines.append(f" # 合并并行结果") + merged = 
    def generate_merge_kernel(self, shape: Tuple[int, int] = (128, 128)) -> str:
        """Generate the helper InCore kernel that merges two branch results.

        The merge is an element-wise add of the two operands; both operands
        and the result share ``shape``.

        Args:
            shape: Tensor shape as ``(rows, cols)``.

        Returns:
            Source code string of the ``merge_results`` kernel.
        """
        rows, cols = shape
        # The template is spliced verbatim into a generated @pl.program class,
        # so its indentation (4-space method indent) must stay intact.
        code = f"""    @pl.function(type=pl.FunctionType.InCore)
    def merge_results(self, a: pl.Tensor[[{rows}, {cols}], pl.FP32],
                      b: pl.Tensor[[{rows}, {cols}], pl.FP32]) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:
        tile_a = pl.op.block.load(a, 0, 0, {rows}, {cols})
        tile_b = pl.op.block.load(b, 0, 0, {rows}, {cols})
        result = pl.op.block.add(tile_a, tile_b)
        return result"""
        return code
src/fuzzer/src/fuzzer.py | 137 +++++- src/fuzzer/src/kernel_generator.py | 55 ++- src/fuzzer/src/multi_kernel_test_generator.py | 78 +++- src/fuzzer/src/orchestrator_generator.py | 19 +- tests/test_cases/test_expand.py | 404 ++++++++++++++++++ 10 files changed, 1524 insertions(+), 135 deletions(-) create mode 100644 src/fuzzer/CHANGELOG.md create mode 100644 src/fuzzer/OP_RULES.md create mode 100644 tests/test_cases/test_expand.py diff --git a/src/fuzzer/CHANGELOG.md b/src/fuzzer/CHANGELOG.md new file mode 100644 index 0000000..c6cc5f2 --- /dev/null +++ b/src/fuzzer/CHANGELOG.md @@ -0,0 +1,332 @@ +# Fuzzer 框架更新日志 + +## 2026-02-11 - 输入形状一致性修复 + +### Bug 修复 + +**修复形状不一致导致的 NumPy 广播错误** + +**问题**: 测试用例配置中使用了不同维度的输入形状,导致 NumPy 参考实现中出现广播错误 +```python +# ✗ 错误:不同维度的输入 +"input_shapes_list": [ + [(128, 128), (64, 64)], # kernel_0: 不同维度 + [(128, 128), (128, 128), (96, 96)], # kernel_1: 混合维度 +] + +# NumPy 计算时报错: +# ValueError: operands could not be broadcast together with shapes (128,128) (96,96) +``` + +**修复后**: +```python +# ✓ 正确:所有输入使用相同维度 +"input_shapes_list": [ + [(128, 128), (128, 128)], # kernel_0: 统一维度 + [(128, 128), (128, 128), (128, 128)], # kernel_1: 统一维度 +] +``` + +**影响范围**: +- **src/fuzzer/example_multi_kernel.py**: + - `fuzz_sequential_simple`: 所有输入改为 128x128 + - `fuzz_branching_parallel`: 所有输入改为 128x128 + - `fuzz_branching_wide`: 所有输入改为 128x128 + +**根本原因**: +- 当内核中有操作涉及不同形状的输入时(如 96x96 和 128x128),会导致 NumPy 广播失败 +- 虽然 PyPTO IR 代码生成时使用了正确的 load 形状,但运算过程中仍会出现形状不匹配 + +**设计决策**: +- 简化测试用例配置,统一使用相同形状的输入 +- 避免在计算过程中处理复杂的形状变换逻辑 +- 确保 NumPy 参考实现和 PyPTO IR 代码行为一致 + +**症状**: `ValueError: operands could not be broadcast together with shapes (128,128) (96,96)` + +--- + +## 2026-02-11 - NumPy 嵌套函数修复 + +### Bug 修复 + +**修复 compute_expected 中嵌套函数的 self 参数问题** + +**问题**: 生成的 NumPy 参考实现函数包含了错误的 `self` 参数 +```python +def compute_expected(self, tensors, params=None): + def _numpy_kernel_0(self, a, b): # ✗ 错误:嵌套函数不应该有 self + ... + result_0 = self._numpy_kernel_0(...) 
# ✗ 错误调用方式 +``` + +**修复后**: +```python +def compute_expected(self, tensors, params=None): + def _numpy_kernel_0(a, b): # ✓ 正确:嵌套函数不需要 self + ... + result_0 = _numpy_kernel_0(...) # ✓ 正确:直接调用 +``` + +**影响范围**: +- **src/fuzzer/src/multi_kernel_test_generator.py**: + - `_generate_numpy_reference()`: 移除嵌套函数的 `self` 参数(第281行) + - `_generate_test_class()`: 所有调用改为直接调用而不使用 `self.`(第532、546、574、599行) + +**症状**: `NameError: name 'self' is not defined` + +--- + +## 2026-02-11 - 形状大小限制 + +### 性能优化 + +**限制最大形状尺寸**: 避免内存溢出,将最大形状从 256x256 限制到 128x128 + +**变更内容**: +1. **fuzzer.py**: + - `get_aligned_shapes()`: 添加 `max_size` 参数,默认 128 + - `generate_aligned_shape()`: 默认 `max_size` 改为 128 + - 常用行数列表从 `[32, 64, 80, 96, 128, 160, 192, 224, 256]` 改为 `[32, 64, 80, 96, 128]` + +2. **example_multi_kernel.py**: + - 所有 256x256 形状改为 96x96 + - 示例配置使用更小、更安全的形状组合 + +**原因**: +- 避免超过硬件内存限制 +- 提高测试执行速度 +- 减少内存分配失败的风险 + +**影响**: +- 生成的测试用例形状范围: 32x32 到 128x128 +- 对齐的列数: 1, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128 + +--- + +## 2026-02-11 - Orchestrator 模式修正 + +### 架构变更 (Breaking Change) + +**Orchestrator 不再显式创建 tensor**: 修正 Orchestration 函数以匹配 PyPTO 框架的正确模式 + +**变更前** (错误): +```python +@pl.function(type=pl.FunctionType.Orchestration) +def orchestrator(self, a: ..., b: ...) -> ...: + # ✗ 错误: 不应该显式创建 tensor + tmp_0 = pl.tensor.create([128, 128], pl.FP32) + tmp_1 = pl.tensor.create([128, 128], pl.FP32) + + tmp_0 = self.kernel_0(a, b, tmp_0) + tmp_1 = self.kernel_1(tmp_0, b, tmp_1) + return tmp_1 +``` + +**变更后** (正确): +```python +@pl.function(type=pl.FunctionType.Orchestration) +def orchestrator(self, a: ..., b: ...) -> ...: + # ✓ 正确: 框架自动管理输出 tensor + result_0 = self.kernel_0(a, b) + result_1 = self.kernel_1(result_0, b) + return result_1 +``` + +**关键区别**: +1. **Orchestration 函数不创建 tensor**: 移除所有 `pl.tensor.create()` 调用 +2. **调用 InCore 函数时只传入输入参数**: 不需要传递输出 tensor +3. 
**框架自动管理输出**: PyPTO 框架会自动分配和管理 InCore 函数的输出 tensor + +**InCore 函数签名保持不变**: +```python +@pl.function(type=pl.FunctionType.InCore) +def kernel_0(self, a: ..., b: ..., output: ...) -> ...: + # InCore 函数仍然需要 output 参数 + tile_a = pl.load(a, ...) + result = pl.store(tile_result, ..., output_tensor=output) + return result +``` + +### 影响范围 + +- **src/fuzzer/src/orchestrator_generator.py**: + - `generate_sequential()`: 移除 tensor 创建,简化 kernel 调用 + - `generate_branching()`: 已经正确,无需修改 + - `generate_mixed()`: 已经正确,无需修改 + - `generate_merge_kernel()`: 移除对齐验证(仍保留 output 参数) + +### 参考 + +- [tests/test_cases/test_expand.py](../../tests/test_cases/test_expand.py): Orchestration 模式参考 +- [tests/test_cases/test_matmul.py](../../tests/test_cases/test_matmul.py): Orchestration 模式参考 + +--- + +## 2026-02-11 - 形状对齐约束和验证 + +### 新增功能 + +1. **32字节对齐约束** (fuzzer.py) + - 添加 `is_shape_aligned()` 函数验证形状是否满足32字节对齐 + - 添加 `get_aligned_shapes()` 函数获取所有对齐的常用形状 + - 添加 `generate_aligned_shape()` 函数随机生成对齐的形状 + - 支持多种数据类型: FP32, FP16, INT32, INT8 + +2. **自动形状验证** (kernel_generator.py) + - `generate_kernel()` 自动验证输入输出形状 + - 检测到不对齐的形状时自动生成对齐的替代形状 + - 打印警告信息提示形状不对齐 + +3. **Orchestrator 形状验证** (orchestrator_generator.py) + - 在创建临时 tensor 时验证形状对齐 + - 在 merge_kernel 生成时验证形状对齐 + - 打印警告信息提示不对齐的形状 + +4. **文档更新** (OP_RULES.md) + - 新增第 0 节: 形状对齐约束 + - 详细说明32字节对齐规则 + - 提供对齐和不对齐的形状示例 + - 说明 Fuzzer 中的对齐验证工具 + +### 对齐规则 + +**核心约束**: +- 形状的尾轴(列数)必须满足: `cols == 1` 或 `(cols * sizeof(dtype)) % 32 == 0` + +**FP32 类型的有效尾轴值**: +- 1, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, ..., 128, ... 
+ +**示例**: +```python +# ✓ 有效 +(128, 1) # 尾轴=1 +(128, 8) # 8*4=32 +(128, 64) # 64*4=256 +(128, 128) # 128*4=512 + +# ✗ 无效 +(128, 3) # 3*4=12, 不对齐 +(128, 5) # 5*4=20, 不对齐 +(128, 10) # 10*4=40, 不对齐 +``` + +### 影响范围 + +- **src/fuzzer/src/fuzzer.py**: 新增对齐验证工具函数 +- **src/fuzzer/src/kernel_generator.py**: 导入并使用对齐验证 +- **src/fuzzer/src/orchestrator_generator.py**: 导入并使用对齐验证 +- **src/fuzzer/OP_RULES.md**: 新增第 0 节文档 + +### 向后兼容性 + +- 现有代码如果使用不对齐的形状,会自动修正并打印警告 +- 不会导致生成失败,而是自动选择最接近的对齐形状 +- 建议手动检查生成的代码,确保形状符合预期 + +--- + +## 2026-02-11 - API 简化和算子扩展 + +### API 变更 (Breaking Change) + +**简化 PyPTO API 调用**: 将 `pl.op.block.xxx` 简化为 `pl.xxx` + +**变更前**: +```python +tile_a = pl.op.block.load(a, 0, 0, 128, 128) +tmp_0 = pl.op.block.add(tile_a, tile_b) +result = pl.op.block.relu(tmp_0) +``` + +**变更后**: +```python +tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 128]) +tmp_0 = pl.add(tile_a, tile_b) +result = pl.relu(tmp_0) +``` + +**影响范围**: +- `kernel_generator.py`: 内核代码生成 +- `multi_kernel_test_generator.py`: 测试类代码生成 +- `orchestrator_generator.py`: 合并内核生成 +- `OP_RULES.md`: 文档示例 +- `README.md`: 文档示例 + +### 新增功能 + +1. **扩展算子支持** (fuzzer.py) + - 新增一元算子: `log`, `abs`, `relu` + - 新增二元算子: `minimum` + - 新增高级算子组: + - Row expand 系列: `row_expand_add`, `row_expand_sub`, `row_expand_mul`, `row_expand_div` + - Matrix 算子: `matmul` + +2. **高级算子开关** + - 添加 `enable_advanced_ops` 参数到所有生成器类 + - 基础模式: 使用标准算子 (add, mul, sqrt, exp等) + - 高级模式: 额外包含 row_expand 和 matmul 算子 + +3. **算子规则文档** ([OP_RULES.md](OP_RULES.md)) + - 完整的算子分类和定义 + - 每个算子的形状约束说明 + - 常见算子组合模式 (Softmax, LayerNorm, GELU, ReLU变体等) + - 禁止的算子组合和约束处理 + - Fuzzer 生成策略建议 + +### 修改文件 + +1. **src/fuzzer.py** + - 扩展 `BLOCK_UNARY_OPS`: 新增 log, abs, relu + - 扩展 `BLOCK_BINARY_OPS`: 新增 minimum + - 新增 `BLOCK_ROW_EXPAND_OPS`: row_expand_* 系列 + - 新增 `BLOCK_MATRIX_OPS`: matmul + - 添加 `enable_advanced_ops` 参数 + - 简化 row_expand 操作的输入类型定义 + +2. **src/kernel_generator.py** + - 添加 `enable_advanced_ops` 参数支持 + - 传递高级算子开关到 OpFuzzer + +3. 
**src/multi_kernel_test_generator.py** + - 添加 `enable_advanced_ops` 参数支持 + - 更新 `_get_numpy_operation` 方法支持所有新算子: + - log, abs, relu, minimum + - row_expand_add, row_expand_sub, row_expand_mul, row_expand_div + - matmul + +4. **example_multi_kernel.py** + - 添加 `--enable-advanced-ops` 命令行参数 + - 在输出中显示是否启用高级算子 + +5. **README.md** + - 更新快速开始部分,区分基础和高级示例 + - 添加高级算子使用说明 + - 添加对 OP_RULES.md 的引用 + - 更新算子列表和约束说明 + +### 使用方法 + +#### 基础模式 (默认) +```bash +python src/fuzzer/example_multi_kernel.py --num-cases 3 +``` +使用算子: add, sub, mul, div, maximum, minimum, adds, subs, muls, divs, sqrt, rsqrt, exp, neg, recip, log, abs, relu + +#### 高级模式 +```bash +python src/fuzzer/example_multi_kernel.py --num-cases 3 --enable-advanced-ops +``` +额外包含: row_expand_add, row_expand_sub, row_expand_mul, row_expand_div, matmul + +### 算子约束 + +- **avoid_zero**: div, divs, recip, row_expand_div +- **positive_only**: sqrt, rsqrt, log +- **row_vec_required**: row_expand_* 系列 (第二个输入需要 [M,1] 形状) + +### 参考文档 + +- [OP_RULES.md](OP_RULES.md) - 完整的算子规则和组合模式 +- [README.md](README.md) - 框架使用文档 +- [tests/test_cases/test_expand.py](../../tests/test_cases/test_expand.py) - row_expand 使用示例 diff --git a/src/fuzzer/OP_RULES.md b/src/fuzzer/OP_RULES.md new file mode 100644 index 0000000..a1bac79 --- /dev/null +++ b/src/fuzzer/OP_RULES.md @@ -0,0 +1,370 @@ +# PyPTO 算子组合规则 (Op Combination Rules) + +本文档定义了 PyPTO IR 中所有支持的算子及其组合规则,用于指导 fuzzer 生成合法的算子组合。 + +## 0. 形状对齐约束 (Shape Alignment Constraints) + +### 0.1 32字节对齐规则 + +**重要**: 所有 tensor 创建和 reshape 操作必须满足 32 字节对齐约束。 + +**规则**: +- 形状的**尾轴**(最后一个维度,即列数)必须满足以下条件之一: + 1. 尾轴 = 1, 或者 + 2. (尾轴 × sizeof(datatype)) % 32 == 0 + +**数据类型大小**: +- FP32: 4 字节 +- FP16: 2 字节 +- INT32: 4 字节 +- INT8: 1 字节 + +**FP32 类型的有效尾轴值**: +- 尾轴 = 1 (总是有效) +- 尾轴 % 8 == 0 (因为 8 × 4 = 32) +- 有效值: 1, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, ... 
+ +**示例 (FP32)**: +```python +# ✓ 有效的形状 +pl.tensor.create([128, 1], pl.FP32) # 尾轴=1 +pl.tensor.create([128, 8], pl.FP32) # 8*4=32, 对齐 +pl.tensor.create([128, 16], pl.FP32) # 16*4=64, 对齐 +pl.tensor.create([128, 32], pl.FP32) # 32*4=128, 对齐 +pl.tensor.create([128, 64], pl.FP32) # 64*4=256, 对齐 +pl.tensor.create([128, 128], pl.FP32) # 128*4=512, 对齐 + +# ✗ 无效的形状 +pl.tensor.create([128, 3], pl.FP32) # 3*4=12, 不对齐 +pl.tensor.create([128, 5], pl.FP32) # 5*4=20, 不对齐 +pl.tensor.create([128, 7], pl.FP32) # 7*4=28, 不对齐 +pl.tensor.create([128, 10], pl.FP32) # 10*4=40, 不对齐 (40 % 32 = 8) +``` + +**Reshape 约束**: +```python +# 示例: reshape 操作也必须满足对齐约束 +tile_tmp = pl.create_tile([8, 1], dtype=pl.FP32, target_memory=1) # ✓ 尾轴=1 +tile_reshaped = pl.reshape(tile_tmp, [1, 8]) # ✓ 尾轴=8, 8*4=32 + +# ✗ 错误示例 +tile_bad = pl.reshape(tile_tmp, [2, 4]) # ✗ 尾轴=4, 4*4=16, 不对齐 +``` + +### 0.2 Fuzzer 中的对齐验证 + +Fuzzer 框架提供以下工具函数: + +```python +from src.fuzzer.src.fuzzer import is_shape_aligned, generate_aligned_shape, get_aligned_shapes + +# 检查形状是否对齐 +is_valid = is_shape_aligned((128, 64), dtype="FP32") # True +is_valid = is_shape_aligned((128, 5), dtype="FP32") # False + +# 生成随机的对齐形状 +shape = generate_aligned_shape(rng, dtype="FP32", max_size=256) + +# 获取所有常用的对齐形状列表 +all_shapes = get_aligned_shapes(dtype="FP32") +``` + +**Fuzzer 自动处理**: +- `KernelGenerator.generate_kernel()` 会自动验证并修正输入/输出形状 +- `OrchestratorGenerator` 会验证所有临时 tensor 的形状 +- 如果检测到不对齐的形状,会打印警告并自动生成对齐的形状 + +## 1. 
算子分类 (Operator Categories) + +### 1.1 Block Memory Operations (内存操作) + +| 算子名 | 输入类型 | 输出类型 | 参数 | 约束 | +|--------|----------|----------|------|------| +| `block.load` | `tensor` | `tile` | `offsets: [int, int]`, `shapes: [int, int]`, `target_memory: int` | target_memory ∈ {1, 2} (UB, L1) | +| `block.store` | `tile` | `tensor` | `offsets: [int, int]`, `shapes: [int, int]`, `output_tensor: tensor` | - | +| `block.l0c_store` | `tile` | `tensor` | `offsets: [int, int]`, `shapes: [int, int]`, `output_tensor: tensor` | - | +| `block.move` | `tile` | `tile` | `target_memory: int`, `transpose: bool` | target_memory ∈ {1, 2, 3, 4} | +| `block.create_tile` | - | `tile` | `shape: [int, int]`, `dtype: DataType`, `target_memory: int` | - | +| `block.full` | - | `tile` | `shape: [int, int]`, `dtype: DataType`, `value: float` | 创建填充值的tile | + +### 1.2 Block Element-wise Binary Operations (逐元素二元操作) + +| 算子名 | 输入类型 | 输出类型 | 形状约束 | NumPy等价 | +|--------|----------|----------|----------|-----------| +| `block.add` | `tile, tile` | `tile` | 支持广播 | `a + b` | +| `block.sub` | `tile, tile` | `tile` | 支持广播 | `a - b` | +| `block.mul` | `tile, tile` | `tile` | 支持广播 | `a * b` | +| `block.div` | `tile, tile` | `tile` | 支持广播,避免除零 | `a / b` | +| `block.maximum` | `tile, tile` | `tile` | 支持广播 | `np.maximum(a, b)` | +| `block.minimum` | `tile, tile` | `tile` | 支持广播 | `np.minimum(a, b)` | +| `block.cmp` | `tile, tile` | `tile` | 支持广播 | 比较操作,cmp_type: 0=EQ, 1=NE, 2=LT, 3=LE, 4=GT, 5=GE | + +### 1.3 Block Scalar Operations (标量操作) + +| 算子名 | 输入类型 | 输出类型 | NumPy等价 | +|--------|----------|----------|-----------| +| `block.adds` | `tile, scalar` | `tile` | `a + s` | +| `block.subs` | `tile, scalar` | `tile` | `a - s` | +| `block.muls` | `tile, scalar` | `tile` | `a * s` | +| `block.divs` | `tile, scalar` | `tile` | `a / s` (避免除零) | +| `block.cmps` | `tile, scalar` | `tile` | 比较操作 | + +### 1.4 Block Unary Operations (一元操作) + +| 算子名 | 输入类型 | 输出类型 | 约束 | NumPy等价 | 
+|--------|----------|----------|------|-----------| +| `block.neg` | `tile` | `tile` | - | `-a` | +| `block.exp` | `tile` | `tile` | 建议输入范围 [-10, 10] | `np.exp(a)` | +| `block.recip` | `tile` | `tile` | 避免除零 | `1.0 / a` | +| `block.sqrt` | `tile` | `tile` | 输入必须 ≥ 0 | `np.sqrt(a)` | +| `block.rsqrt` | `tile` | `tile` | 输入必须 > 0 | `1.0 / np.sqrt(a)` | +| `block.log` | `tile` | `tile` | 输入必须 > 0 | `np.log(a)` | +| `block.abs` | `tile` | `tile` | - | `np.abs(a)` | +| `block.relu` | `tile` | `tile` | - | `np.maximum(0, a)` | +| `block.cast` | `tile` | `tile` | 参数: `target_dtype: DataType`, `mode: int` | 类型转换 | + +### 1.5 Block Matrix Operations (矩阵操作) + +| 算子名 | 输入类型 | 输出类型 | 形状约束 | NumPy等价 | +|--------|----------|----------|----------|-----------| +| `block.matmul` | `tile, tile` | `tile` | `[M, K] @ [K, N] -> [M, N]` | `a @ b` | +| `block.matmul_acc` | `tile, tile, tile` | `tile` | `acc + (lhs @ rhs)` | `acc + a @ b` | + +### 1.6 Block Row/Column Broadcast Operations (行列广播操作) + +**重要**: 这些操作用于处理向量与矩阵的广播运算。 + +| 算子名 | 输入类型 | 输出类型 | 形状约束 | NumPy等价 | +|--------|----------|----------|----------|-----------| +| `block.row_expand_add` | `tile[M,N], tile[M,1]` | `tile[M,N]` | row_vec广播到每行 | `tile + row_vec` | +| `block.row_expand_sub` | `tile[M,N], tile[M,1]` | `tile[M,N]` | row_vec广播到每行 | `tile - row_vec` | +| `block.row_expand_mul` | `tile[M,N], tile[M,1]` | `tile[M,N]` | row_vec广播到每行 | `tile * row_vec` | +| `block.row_expand_div` | `tile[M,N], tile[M,1]` | `tile[M,N]` | row_vec广播到每行,避免除零 | `tile / row_vec` | +| `block.col_expand` | `tile[M,N], tile[1,N]` | `tile[M,N]` | col_vec广播到每列 | 列向量扩展 | +| `block.col_expand_mul` | `tile[M,N], tile[1,N]` | `tile[M,N]` | col_vec广播到每列 | `tile * col_vec` | +| `block.col_expand_div` | `tile[M,N], tile[1,N]` | `tile[M,N]` | col_vec广播到每列,避免除零 | `tile / col_vec` | +| `block.col_expand_sub` | `tile[M,N], tile[1,N]` | `tile[M,N]` | col_vec广播到每列 | `tile - col_vec` | +| `block.expands` | `tile[M,N], scalar` | `tile[M,N]` | 标量广播到tile形状 | 标量扩展 
| + +### 1.7 Block Reduction Operations (归约操作) + +| 算子名 | 输入类型 | 输出类型 | 参数 | 形状变换 | NumPy等价 | +|--------|----------|----------|------|----------|-----------| +| `block.sum` | `tile` | `tile` | `axis: int`, `keepdim: bool` | axis=1, keepdim=True: [M,N]->[M,1] | `np.sum(a, axis=axis, keepdims=keepdim)` | +| `block.max` | `tile` | `tile` | `axis: int`, `keepdim: bool` | 同上 | `np.max(a, axis=axis, keepdims=keepdim)` | +| `block.min` | `tile` | `tile` | `axis: int`, `keepdim: bool` | 同上 | `np.min(a, axis=axis, keepdims=keepdim)` | +| `block.row_sum` | `tile, tile` | `tile` | 需要临时tile | [M,N] -> [M,1] | `np.sum(a, axis=1, keepdims=True)` | +| `block.row_max` | `tile, tile` | `tile` | 需要临时tile | [M,N] -> [M,1] | `np.max(a, axis=1, keepdims=True)` | +| `block.row_min` | `tile, tile` | `tile` | 需要临时tile | [M,N] -> [M,1] | `np.min(a, axis=1, keepdims=True)` | + +### 1.8 Block Transform Operations (变换操作) + +| 算子名 | 输入类型 | 输出类型 | 参数 | 形状变换 | +|--------|----------|----------|------|----------| +| `block.reshape` | `tile` | `tile` | `shape: [int, int]` | 重塑形状 | +| `block.transpose` | `tile` | `tile` | `axis1: int`, `axis2: int` | 交换维度 | +| `block.view` | `tile` | `tile` | `shape: [int, int]`, `offset: [int, int]` | 创建视图 | + +### 1.9 Tensor-level Operations (Tensor级别操作) + +| 算子名 | 输入类型 | 输出类型 | 说明 | +|--------|----------|----------|------| +| `tensor.create` | - | `tensor` | 创建tensor | +| `tensor.view` | `tensor` | `tensor` | 创建tensor视图 | +| `tensor.matmul` | `tensor, tensor` | `tensor` | tensor级矩阵乘法 | +| `tensor.mul` | `tensor, tensor/scalar` | `tensor` | tensor级乘法 | +| `tensor.add` | `tensor, tensor/scalar` | `tensor` | tensor级加法 | +| `tensor.sub` | `tensor, tensor/scalar` | `tensor` | tensor级减法 | +| `tensor.div` | `tensor, tensor/scalar` | `tensor` | tensor级除法 | + +## 2. 算子组合规则 (Combination Rules) + +### 2.1 基本组合规则 + +1. **类型匹配**: 操作符的输入类型必须匹配 + - `tile` 操作符接受 `tile` 类型 + - `tensor` 操作符接受 `tensor` 类型 + - 不能混用 + +2. 
**形状兼容性**: + - 二元操作支持广播:`[M,N] op [M,N]`, `[M,N] op [M,1]`, `[M,N] op [1,N]` + - Row expand 操作: 第二个输入必须是 `[M,1]` 形状 + - Col expand 操作: 第二个输入必须是 `[1,N]` 形状 + - Matmul: `[M,K] @ [K,N] -> [M,N]` + +3. **数据约束**: + - **避免除零**: `div`, `divs`, `recip`, `row_expand_div`, `col_expand_div` + - 确保分母绝对值 ≥ 0.01 + - **正值约束**: `sqrt`, `rsqrt`, `log` + - 确保输入 > 0 或使用 `abs(x) + 1e-6` + - **范围约束**: `exp` + - 建议输入范围 [-10, 10] 避免溢出 + +### 2.2 常见算子组合模式 + +#### 模式1: Softmax 组件 +```python +# Step 1: Row max reduction +max_vals = pl.row_max(tile, tmp_tile) # [M,N] -> [M,1] + +# Step 2: Subtract max (数值稳定性) +centered = pl.row_expand_sub(tile, max_vals) # [M,N] - [M,1] -> [M,N] + +# Step 3: Exponential +exp_vals = pl.exp(centered) # [M,N] -> [M,N] + +# Step 4: Row sum +sum_vals = pl.row_sum(exp_vals, tmp_tile) # [M,N] -> [M,1] + +# Step 5: Normalize +output = pl.row_expand_div(exp_vals, sum_vals) # [M,N] / [M,1] -> [M,N] +``` + +#### 模式2: Layer Normalization 组件 +```python +# Step 1: Row mean (使用 sum + divs) +row_sum = pl.row_sum(tile, tmp_tile) # [M,N] -> [M,1] +row_mean = pl.divs(row_sum, N) # [M,1] / scalar -> [M,1] + +# Step 2: Subtract mean +centered = pl.row_expand_sub(tile, row_mean) # [M,N] - [M,1] -> [M,N] + +# Step 3: Squared +squared = pl.mul(centered, centered) # [M,N] * [M,N] -> [M,N] + +# Step 4: Variance +var_sum = pl.row_sum(squared, tmp_tile) # [M,N] -> [M,1] +variance = pl.divs(var_sum, N) # [M,1] / scalar -> [M,1] + +# Step 5: Inverse std +inv_std = pl.rsqrt(variance) # [M,1] -> [M,1] + +# Step 6: Normalize +output = pl.row_expand_mul(centered, inv_std) # [M,N] * [M,1] -> [M,N] +``` + +#### 模式3: GELU 近似 +```python +# GELU(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3))) +# 简化版本: 使用 sigmoid 近似 +# GELU(x) ≈ x * sigmoid(1.702 * x) + +# Step 1: Scale +scaled = pl.muls(tile, 1.702) # [M,N] * scalar -> [M,N] + +# Step 2: Sigmoid approximation (使用 exp) +neg_scaled = pl.neg(scaled) # -[M,N] +exp_neg = pl.exp(neg_scaled) # exp(-scaled) +one_plus_exp = pl.adds(exp_neg, 
1.0) # 1 + exp(-scaled) +sigmoid = pl.recip(one_plus_exp) # 1 / (1 + exp(-scaled)) + +# Step 3: Multiply +output = pl.mul(tile, sigmoid) # [M,N] * [M,N] -> [M,N] +``` + +#### 模式4: ReLU 及变体 +```python +# ReLU +output = pl.relu(tile) + +# LeakyReLU (alpha=0.01) +neg_part = pl.muls(tile, 0.01) # 负半部分 +output = pl.maximum(tile, neg_part) # max(x, 0.01*x) + +# ELU (alpha=1.0) - 简化版 +# ELU(x) = x if x > 0 else alpha * (exp(x) - 1) +zeros = pl.expands(tile, 0.0) +pos_mask = pl.maximum(tile, zeros) # 正半部分 +exp_x = pl.exp(tile) # exp(x) +exp_minus_1 = pl.subs(exp_x, 1.0) # exp(x) - 1 +# 需要 select 操作来完整实现 +``` + +### 2.3 禁止的算子组合 + +1. **类型混用**: + ```python + # ✗ 错误: 不能直接对 tensor 使用 block 操作 + tile_result = pl.add(tensor_a, tensor_b) + + # ✓ 正确: 先 load 到 tile + tile_a = pl.load(tensor_a, offsets=[0, 0], shapes=[M, N]) + tile_b = pl.load(tensor_b, offsets=[0, 0], shapes=[M, N]) + tile_result = pl.add(tile_a, tile_b) + ``` + +2. **形状不匹配**: + ```python + # ✗ 错误: row_expand 操作需要 [M,1] 形状 + tile_a = [128, 128] + tile_b = [128, 64] # 错误形状 + result = pl.row_expand_div(tile_a, tile_b) + + # ✓ 正确: 使用 reshape 或正确的 load 形状 + tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 1]) # [128,1] + result = pl.row_expand_div(tile_a, tile_b) + ``` + +3. **未处理的数值约束**: + ```python + # ✗ 错误: 可能除零 + result = pl.div(tile_a, tile_b) + + # ✓ 正确: 确保分母不为零 + tile_b_safe = pl.maximum(tile_b, pl.expands(tile_b, 0.01)) + result = pl.div(tile_a, tile_b_safe) + ``` + +## 3. Fuzzer 生成策略 + +### 3.1 操作符选择权重 + +基于实际硬件支持和测试价值,建议权重: + +- **高频操作** (权重 10): `add`, `mul`, `sub`, `maximum`, `adds`, `muls` +- **中频操作** (权重 5): `div`, `sqrt`, `exp`, `row_expand_*`, `matmul` +- **低频操作** (权重 2): `rsqrt`, `log`, `recip`, `transpose`, `reshape` +- **特殊操作** (权重 1): `cast`, `cmp`, reduction 操作 + +### 3.2 形状生成策略 + +支持的形状规格: +- **标准方形**: 32x32, 64x64, 96x96, 128x128, 256x256 +- **长方形**: 64x128, 128x64, 80x96, 96x80, 128x256 +- **向量形状**: Nx1, 1xN (用于 row/col expand) + +### 3.3 操作链生成规则 + +1. **长度范围**: 3-10 个操作 +2. 
**变量重用**: 每个中间结果至少使用一次 +3. **输入使用**: 所有输入必须至少被使用一次 +4. **类型一致性**: 操作链内保持 tile 类型 +5. **形状追踪**: 追踪每个变量的形状以确保兼容性 + +### 3.4 测试用例模板 + +```python +@pl.function(type=pl.FunctionType.InCore) +def kernel_func(self, a: pl.Tensor[[M, N], pl.FP32], + b: pl.Tensor[[M, 1], pl.FP32]) -> pl.Tensor[[M, N], pl.FP32]: + # Load tiles + tile_a = pl.load(a, offsets=[0, 0], shapes=[M, N]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[M, 1]) + + # Operation chain (fuzzer generated) + tmp_0 = pl.row_expand_div(tile_a, tile_b) + tmp_1 = pl.sqrt(tmp_0) + tmp_2 = pl.muls(tmp_1, 2.0) + # ... more operations + + return tmp_final +``` + +## 4. 参考示例 + +完整示例见: [tests/test_cases/test_expand.py](../../tests/test_cases/test_expand.py) + +主要展示了: +- 如何使用 `row_expand_div` 操作 +- 如何处理不同形状的输入 +- 如何编写 `compute_expected` 参考实现 diff --git a/src/fuzzer/README.md b/src/fuzzer/README.md index e1b6dd0..1f0ee7d 100644 --- a/src/fuzzer/README.md +++ b/src/fuzzer/README.md @@ -6,8 +6,9 @@ ## 快速开始 +### 基础示例 (基本算子) ```bash -# 生成1个测试用例 +# 生成1个测试用例 (使用基础算子: add, mul, div, sqrt, exp等) python src/fuzzer/example_multi_kernel.py --num-cases 1 # 生成5个测试用例 @@ -20,6 +21,26 @@ pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only --save-kernels --kernels-dir=/tmp/kernels ``` +**说明**: 基础示例默认使用以下算子: +- 二元: add, sub, mul, div, maximum, minimum +- 标量: adds, subs, muls, divs +- 一元: sqrt, rsqrt, exp, neg, recip, log, abs, relu + +### 高级示例 (row_expand, matmul 等高级算子) +```bash +# 生成使用高级算子的测试用例 +python src/fuzzer/example_multi_kernel.py --num-cases 3 --enable-advanced-ops + +# 运行高级算子测试 +pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only +``` + +**高级算子包括**: +- Row expand: row_expand_add, row_expand_sub, row_expand_mul, row_expand_div +- Matrix: matmul + +**注意**: 使用 row_expand 算子时,请确保输入形状正确配置(第二个输入应为 [M, 1] 形状)。 + ## 目录结构 ``` @@ -40,30 +61,46 @@ src/fuzzer/ # 独立的模糊测试框架 ## Op 组合规则 +**详细规则文档**: 请参考 
[OP_RULES.md](OP_RULES.md) 获取完整的算子规则和组合模式。 + ### 1. 操作符定义 -操作符在 [fuzzer.py](fuzzer.py) 的 `OpFuzzer.__init__` 方法中定义。 +操作符在 [src/fuzzer.py](src/fuzzer.py) 的 `OpFuzzer.__init__` 方法中定义。 **当前支持的操作**: -- **二元操作**: `block.add`, `block.sub`, `block.mul`, `block.div`, `block.maximum` +- **二元操作**: `block.add`, `block.sub`, `block.mul`, `block.div`, `block.maximum`, `block.minimum` - **标量操作**: `block.adds`, `block.subs`, `block.muls`, `block.divs` -- **一元操作**: `block.sqrt`, `block.rsqrt`, `block.exp`, `block.neg`, `block.recip` +- **一元操作**: `block.sqrt`, `block.rsqrt`, `block.exp`, `block.neg`, `block.recip`, `block.log`, `block.abs`, `block.relu` +- **行广播操作** (高级): `block.row_expand_add`, `block.row_expand_sub`, `block.row_expand_mul`, `block.row_expand_div` +- **矩阵操作** (高级): `block.matmul` -**添加新操作**: +**启用高级操作**: ```python -# 在 fuzzer.py 的 OpFuzzer.__init__ 中修改 -self.ops = self.BLOCK_BINARY_OPS + self.BLOCK_SCALAR_OPS + self.BLOCK_UNARY_OPS +# 在生成器中启用高级操作 +from src.fuzzer.src.fuzzer import OpFuzzer -# 或者只使用基础操作 -basic_ops = [ - OpSpec("block.add", ["tile", "tile"], "tile", {}, lambda a, b: a + b), - OpSpec("block.sub", ["tile", "tile"], "tile", {}, lambda a, b: a - b), - OpSpec("block.mul", ["tile", "tile"], "tile", {}, lambda a, b: a * b), - OpSpec("block.div", ["tile", "tile"], "tile", {"avoid_zero": True}, lambda a, b: a / b), +# 启用行广播和矩阵操作 +fuzzer = OpFuzzer(seed=42, enable_advanced_ops=True) +``` + +**添加新操作**: +```python +# 在 fuzzer.py 中定义新操作 +CUSTOM_OPS = [ + OpSpec("block.custom_op", ["tile", "tile"], "tile", {}, lambda a, b: custom_numpy_impl(a, b)), ] -self.ops = basic_ops + +# 在 __init__ 中添加 +self.ops = self.ops + CUSTOM_OPS ``` +**操作符约束**: +- `avoid_zero`: 用于除法操作,确保分母不为零 +- `positive_only`: 用于 sqrt, log 等操作,确保输入为正数 +- `row_vec_shape`: 用于 row_expand 操作,要求第二个输入形状为 [M,1] + +更多详情请查看 [OP_RULES.md](OP_RULES.md) 中的完整算子列表和约束说明。 + ### 2. 
内核生成规则 每个 InCore 内核包含: @@ -103,11 +140,11 @@ self.ops = basic_ops # 生成的内核代码 - 不同维度的输入 @pl.function(type=pl.FunctionType.InCore) def kernel_0(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[64, 64], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: - tile_a = pl.op.block.load(a, 0, 0, 128, 128) - tile_b = pl.op.block.load(b, 0, 0, 128, 128) # 加载到输出大小 - tmp_0 = pl.op.block.add(tile_b, tile_a) # 操作1: b + a - tmp_1 = pl.op.block.mul(tmp_0, tile_a) # 操作2: tmp_0 * a - tmp_2 = pl.op.block.sub(tmp_1, tile_b) # 操作3: tmp_1 - b + tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 128]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 128]) # 加载到输出大小 + tmp_0 = pl.add(tile_b, tile_a) # 操作1: b + a + tmp_1 = pl.mul(tmp_0, tile_a) # 操作2: tmp_0 * a + tmp_2 = pl.sub(tmp_1, tile_b) # 操作3: tmp_1 - b return tmp_2 ``` @@ -162,6 +199,36 @@ OpSpec( - `param_generator`: 参数生成函数(可选) - `requires_params`: 是否需要参数 +### 5. 常见算子组合模式 + +参考 [OP_RULES.md](OP_RULES.md) 第 2.2 节获取完整的算子组合模式,包括: + +#### Softmax 模式 +```python +# 1. Row max reduction +max_vals = pl.row_max(tile, tmp_tile) +# 2. Subtract max for numerical stability +centered = pl.row_expand_sub(tile, max_vals) +# 3. Exponential +exp_vals = pl.exp(centered) +# 4. Row sum +sum_vals = pl.row_sum(exp_vals, tmp_tile) +# 5. Normalize +output = pl.row_expand_div(exp_vals, sum_vals) +``` + +#### ReLU 及变体 +```python +# ReLU +output = pl.relu(tile) + +# LeakyReLU (alpha=0.01) +neg_part = pl.muls(tile, 0.01) +output = pl.maximum(tile, neg_part) +``` + +更多模式请参考 [OP_RULES.md](OP_RULES.md)。 + ## 命令行参数 ### 生成测试用例 @@ -250,11 +317,21 @@ self.ops = custom_ops ## 注意事项 -1. **张量形状**: 支持不同维度的输入张量,可以在配置中指定每个内核的输入形状 -2. **数据类型**: 当前仅支持 FP32 类型 -3. **操作约束**: 框架自动处理除零、负数开方等约束 -4. **ISA 支持**: 确保添加的操作在目标硬件的 ISA 中有对应实现 -5. **输入数量**: 每个内核支持 1-3 个输入张量,可以在配置中指定 +1. **32字节对齐约束**: 所有 tensor 创建和 reshape 操作的形状必须满足32字节对齐 + - 形状尾轴(列数)必须是 1,或 `(cols * sizeof(dtype)) % 32 == 0` + - FP32 类型有效的列数: 1, 8, 16, 24, 32, 40, 48, 56, 64, ..., 128, ... 
+ - Fuzzer 会自动验证并修正不对齐的形状 + - 详见 [OP_RULES.md](OP_RULES.md) 第 0 节 + +2. **张量形状**: 支持不同维度的输入张量,可以在配置中指定每个内核的输入形状 + +3. **数据类型**: 当前仅支持 FP32 类型 + +4. **操作约束**: 框架自动处理除零、负数开方等约束 + +5. **ISA 支持**: 确保添加的操作在目标硬件的 ISA 中有对应实现 + +6. **输入数量**: 每个内核支持 1-3 个输入张量,可以在配置中指定 ## 参考文件 diff --git a/src/fuzzer/example_multi_kernel.py b/src/fuzzer/example_multi_kernel.py index d3fac95..6bf90b1 100644 --- a/src/fuzzer/example_multi_kernel.py +++ b/src/fuzzer/example_multi_kernel.py @@ -61,6 +61,12 @@ def main(): help="随机种子,用于可重现性 (默认: 42)" ) + parser.add_argument( + "--enable-advanced-ops", + action="store_true", + help="启用高级算子 (row_expand, matmul等)" + ) + args = parser.parse_args() # 设置输出路径 @@ -73,6 +79,7 @@ def main(): print(f"=" * 60) print(f"测试用例数量: {args.num_cases}") print(f"随机种子: {args.seed}") + print(f"启用高级算子: {'是' if args.enable_advanced_ops else '否'}") print(f"输出文件: {output_path}") print(f"=" * 60) print() @@ -86,10 +93,10 @@ def main(): "shape": (128, 128), "num_ops_range": (3, 5), "input_shapes_list": [ - [(128, 128), (64, 64)], # kernel_0: 2个不同维度的输入 - [(128, 128), (128, 128), (256, 256)], # kernel_1: 3个不同维度的输入 + [(128, 128), (128, 128)], # kernel_0: 2个相同维度的输入 + [(128, 128), (128, 128), (128, 128)], # kernel_1: 3个相同维度的输入 ], - "description": "简单顺序执行:2个内核,不同维度输入" + "description": "简单顺序执行:2个内核,相同维度输入" }, { "name": "fuzz_branching_parallel", @@ -99,10 +106,10 @@ def main(): "num_ops_range": (4, 6), "input_shapes_list": [ [(128, 128), (128, 128)], # kernel_0: 2个相同维度 - [(64, 64), (128, 128)], # kernel_1: 2个不同维度 - [(256, 256)], # kernel_2: 1个输入 + [(128, 128), (128, 128)], # kernel_1: 2个相同维度 + [(128, 128)], # kernel_2: 1个输入 ], - "description": "分支并行执行:3个内核,不同输入数量" + "description": "分支并行执行:3个内核,相同维度输入" }, { "name": "fuzz_mixed_complex", @@ -129,12 +136,12 @@ def main(): "shape": (128, 128), "num_ops_range": (4, 7), "input_shapes_list": [ - [(128, 128), (64, 64), (256, 256)], # kernel_0: 3个不同维度 - [(128, 128)], # kernel_1: 1个输入 - [(64, 64), (64, 64)], # kernel_2: 2个相同维度 - [(256, 256), 
(128, 128)], # kernel_3: 2个不同维度 + [(128, 128), (128, 128), (128, 128)], # kernel_0: 3个相同维度 + [(128, 128)], # kernel_1: 1个输入 + [(128, 128), (128, 128)], # kernel_2: 2个相同维度 + [(128, 128), (128, 128)], # kernel_3: 2个相同维度 ], - "description": "宽分支执行:4个内核,多样化输入配置" + "description": "宽分支执行:4个内核,统一维度输入" }, ] @@ -149,7 +156,7 @@ def main(): print() # 创建生成器 - generator = MultiKernelTestGenerator(seed=args.seed) + generator = MultiKernelTestGenerator(seed=args.seed, enable_advanced_ops=args.enable_advanced_ops) # 生成测试文件 print("正在生成测试文件...") diff --git a/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py b/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py index 0d5c1bf..0046f07 100644 --- a/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py +++ b/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py @@ -40,7 +40,7 @@ def define_tensors(self) -> List[TensorSpec]: return [ TensorSpec('a', [128, 128], DataType.FP32, init_value=2.0), TensorSpec('b', [128, 128], DataType.FP32, init_value=2.5), - TensorSpec('c', [256, 256], DataType.FP32, init_value=3.0), + TensorSpec('c', [128, 128], DataType.FP32, init_value=3.0), TensorSpec('output', [128, 128], DataType.FP32, is_output=True), ] @@ -50,28 +50,31 @@ def get_program(self) -> Any: @pl.program class FuzzSequentialSimpleProgram: @pl.function(type=pl.FunctionType.InCore) - def kernel_0(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: - tile_a = pl.op.block.load(a, 0, 0, 128, 128) - tile_b = pl.op.block.load(b, 0, 0, 128, 128) - tmp_0 = pl.op.block.div(tile_b, tile_a) - tmp_1 = pl.op.block.sub(tmp_0, tile_a) - tmp_2 = pl.op.block.div(tmp_1, tile_b) - return tmp_2 + def kernel_0(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], output: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 128]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 128]) + tmp_0 = pl.subs(tile_b, 1.0) + 
tmp_1 = pl.mul(tile_a, tile_a) + tmp_2 = pl.subs(tmp_1, 1.0) + tmp_3 = pl.add(tmp_0, tmp_2) + result = pl.store(tmp_3, offsets=[0, 0], shapes=[128, 128], output_tensor=output) + return result @pl.function(type=pl.FunctionType.InCore) - def kernel_1(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], c: pl.Tensor[[256, 256], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: - tile_a = pl.op.block.load(a, 0, 0, 128, 128) - tile_b = pl.op.block.load(b, 0, 0, 128, 128) - tile_c = pl.op.block.load(c, 0, 0, 128, 128) - tmp_0 = pl.op.block.add(tile_a, tile_c) - tmp_1 = pl.op.block.neg(tile_b) - tmp_2 = pl.op.block.maximum(tmp_1, tmp_1) - tmp_3 = pl.op.block.rsqrt(tmp_0) - tmp_4 = pl.op.block.add(tmp_2, tmp_3) - return tmp_4 + def kernel_1(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], c: pl.Tensor[[128, 128], pl.FP32], output: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 128]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 128]) + tile_c = pl.load(c, offsets=[0, 0], shapes=[128, 128]) + tmp_0 = pl.add(tile_a, tile_c) + tmp_1 = pl.muls(tile_b, 0.5) + tmp_2 = pl.rsqrt(tmp_1) + tmp_3 = pl.exp(tmp_0) + tmp_4 = pl.add(tmp_2, tmp_3) + result = pl.store(tmp_4, offsets=[0, 0], shapes=[128, 128], output_tensor=output) + return result @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], c: pl.Tensor[[256, 256], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + def orchestrator(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], c: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: result_0 = self.kernel_0(a, b) result_1 = self.kernel_1(result_0, b, c) return result_1 @@ -80,44 +83,41 @@ def orchestrator(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128 def compute_expected(self, tensors, params=None): """使用 NumPy 
计算期望输出""" - def _numpy_kernel_0(self, a, b): - """NumPy 实现: kernel_0""" - # 创建变量环境 - env = {} - env['tile_a'] = a.copy() - env['tile_b'] = b.copy() - - # 执行操作链 - env['tile_b'] = np.where(np.abs(env['tile_b']) < 0.01, 1.0, env['tile_b']) - env['tile_a'] = np.where(np.abs(env['tile_a']) < 0.01, 1.0, env['tile_a']) - env['tmp_0'] = env['tile_b'] / env['tile_a'] - env['tmp_1'] = env['tmp_0'] - env['tile_a'] - env['tmp_1'] = np.where(np.abs(env['tmp_1']) < 0.01, 1.0, env['tmp_1']) - env['tile_b'] = np.where(np.abs(env['tile_b']) < 0.01, 1.0, env['tile_b']) - env['tmp_2'] = env['tmp_1'] / env['tile_b'] - return env['tmp_2'] - - def _numpy_kernel_1(self, a, b, c): - """NumPy 实现: kernel_1""" - # 创建变量环境 - env = {} - env['tile_a'] = a.copy() - env['tile_b'] = b.copy() - env['tile_c'] = c.copy() - - # 执行操作链 - env['tmp_0'] = env['tile_a'] + env['tile_c'] - env['tmp_1'] = -env['tile_b'] - env['tmp_2'] = np.maximum(env['tmp_1'], env['tmp_1']) - env['tmp_0'] = np.abs(env['tmp_0']) + 1e-6 - env['tmp_3'] = 1.0 / np.sqrt(env['tmp_0']) - env['tmp_4'] = env['tmp_2'] + env['tmp_3'] - return env['tmp_4'] + def _numpy_kernel_0(a, b): + """NumPy 实现: kernel_0""" + # 创建变量环境 + env = {} + env['tile_a'] = a.copy() + env['tile_b'] = b.copy() + + # 执行操作链 + env['tmp_0'] = env['tile_b'] - 1.0 + env['tmp_1'] = env['tile_a'] * env['tile_a'] + env['tmp_2'] = env['tmp_1'] - 1.0 + env['tmp_3'] = env['tmp_0'] + env['tmp_2'] + return env['tmp_3'] + + def _numpy_kernel_1(a, b, c): + """NumPy 实现: kernel_1""" + # 创建变量环境 + env = {} + env['tile_a'] = a.copy() + env['tile_b'] = b.copy() + env['tile_c'] = c.copy() + + # 执行操作链 + env['tmp_0'] = env['tile_a'] + env['tile_c'] + env['tmp_1'] = env['tile_b'] * 0.5 + env['tmp_1'] = np.abs(env['tmp_1']) + 1e-6 + env['tmp_2'] = 1.0 / np.sqrt(env['tmp_1']) + env['tmp_3'] = np.exp(np.clip(env['tmp_0'], -10, 10)) + env['tmp_4'] = env['tmp_2'] + env['tmp_3'] + return env['tmp_4'] # 顺序执行模式 - result_0 = self._numpy_kernel_0(tensors['a'], tensors['b']) - result_1 = 
self._numpy_kernel_1(result_0, tensors['b'], tensors['c']) + result_0 = _numpy_kernel_0(tensors['a'], tensors['b']) + result_1 = _numpy_kernel_1(result_0, tensors['b'], tensors['c']) tensors['output'][:] = result_1 diff --git a/src/fuzzer/src/fuzzer.py b/src/fuzzer/src/fuzzer.py index 4e5b3bd..06c02a9 100644 --- a/src/fuzzer/src/fuzzer.py +++ b/src/fuzzer/src/fuzzer.py @@ -9,6 +9,97 @@ import numpy as np # Used in lambda functions for op equivalents +# 数据类型字节大小 +DTYPE_SIZES = { + "FP32": 4, + "FP16": 2, + "INT8": 1, + "INT32": 4, +} + + +def is_shape_aligned(shape: Tuple[int, int], dtype: str = "FP32") -> bool: + """检查形状是否满足32字节对齐约束 + + Args: + shape: (rows, cols) 形状元组 + dtype: 数据类型 (默认 FP32) + + Returns: + True 如果形状满足对齐要求 + + 规则: + - 尾轴 (cols) 必须是 1, 或者 + - (尾轴 * sizeof(dtype)) 必须是 32 的倍数 + + 示例 (FP32, sizeof=4): + - (128, 1) ✓ 尾轴=1 + - (128, 8) ✓ 8*4=32, 对齐 + - (128, 16) ✓ 16*4=64, 对齐 + - (128, 32) ✓ 32*4=128, 对齐 + - (128, 5) ✗ 5*4=20, 不对齐 + """ + rows, cols = shape + dtype_size = DTYPE_SIZES.get(dtype, 4) + + # 尾轴是1,总是对齐 + if cols == 1: + return True + + # 检查 (尾轴 * sizeof(dtype)) 是否是32的倍数 + return (cols * dtype_size) % 32 == 0 + + +def get_aligned_shapes(dtype: str = "FP32", max_size: int = 128) -> List[Tuple[int, int]]: + """获取所有满足对齐约束的常用形状 + + Args: + dtype: 数据类型 (默认 FP32) + max_size: 最大维度大小 (默认 128,避免内存溢出) + + Returns: + 对齐的形状列表 + """ + dtype_size = DTYPE_SIZES.get(dtype, 4) + # 计算最小对齐的列数 (除了1) + min_aligned_cols = 32 // dtype_size # FP32: 8, FP16: 16, INT8: 32 + + aligned_shapes = [] + + # 常用行数 - 限制最大为 max_size + common_rows = [32, 64, 80, 96, 128] + common_rows = [r for r in common_rows if r <= max_size] + + # 对齐的列数: 1, min_aligned_cols, 2*min_aligned_cols, ... 
+ for rows in common_rows: + # 列数为1的情况 + aligned_shapes.append((rows, 1)) + + # 对齐的列数 + max_multiplier = max_size // min_aligned_cols + for multiplier in range(1, max_multiplier + 1): + cols = min_aligned_cols * multiplier + if cols <= max_size: + aligned_shapes.append((rows, cols)) + + return aligned_shapes + + +def generate_aligned_shape(rng, dtype: str = "FP32", max_size: int = 128) -> Tuple[int, int]: + """随机生成一个对齐的形状 + + Args: + rng: 随机数生成器 + dtype: 数据类型 + max_size: 最大维度大小 (默认 128,避免内存溢出) + + Returns: + 满足对齐约束的形状元组 + """ + aligned_shapes = get_aligned_shapes(dtype, max_size) + return rng.choice(aligned_shapes) if aligned_shapes else (128, 128) + + @dataclass class OpSpec: """Operator specification for fuzzing. @@ -60,6 +151,7 @@ class OpFuzzer: OpSpec("block.mul", ["tile", "tile"], "tile", {}, lambda a, b: a * b), OpSpec("block.div", ["tile", "tile"], "tile", {"avoid_zero": True}, lambda a, b: a / b), OpSpec("block.maximum", ["tile", "tile"], "tile", {}, lambda a, b: np.maximum(a, b)), + OpSpec("block.minimum", ["tile", "tile"], "tile", {}, lambda a, b: np.minimum(a, b)), ] # Block-level scalar operators @@ -77,14 +169,53 @@ class OpFuzzer: OpSpec("block.exp", ["tile"], "tile", {}, lambda a: np.exp(np.clip(a, -10, 10))), OpSpec("block.neg", ["tile"], "tile", {}, lambda a: -a), OpSpec("block.recip", ["tile"], "tile", {"avoid_zero": True}, lambda a: 1.0 / a), + OpSpec("block.log", ["tile"], "tile", {"positive_only": True}, lambda a: np.log(a)), + OpSpec("block.abs", ["tile"], "tile", {}, lambda a: np.abs(a)), + OpSpec("block.relu", ["tile"], "tile", {}, lambda a: np.maximum(0, a)), + ] + + # Block-level row expand operators (需要特殊的形状处理) + # 注意: 这些操作需要第二个输入是 [M, 1] 形状 + BLOCK_ROW_EXPAND_OPS = [ + OpSpec("block.row_expand_add", ["tile", "tile"], "tile", {"row_vec_required": True}, + lambda a, b: a + b), # b is [M,1], broadcasts to [M,N] + OpSpec("block.row_expand_sub", ["tile", "tile"], "tile", {"row_vec_required": True}, + lambda a, b: a - b), + 
OpSpec("block.row_expand_mul", ["tile", "tile"], "tile", {"row_vec_required": True}, + lambda a, b: a * b), + OpSpec("block.row_expand_div", ["tile", "tile"], "tile", {"row_vec_required": True, "avoid_zero": True}, + lambda a, b: a / b), ] - def __init__(self, seed: Optional[int] = None): - """Initialize fuzzer with optional seed for reproducibility.""" + # Block-level reduction operators (改变形状) + # axis=1: 沿最后一个轴归约, [M,N] -> [M,1] + BLOCK_REDUCTION_OPS = [ + # 注意: row_sum, row_max, row_min 需要一个临时tile参数 + # 为了简化,这里先不包含它们,或者使用 sum/max/min with axis参数 + ] + + # Block-level matrix operators + BLOCK_MATRIX_OPS = [ + OpSpec("block.matmul", ["tile", "tile"], "tile", {"matmul_shape": True}, + lambda a, b: a @ b, + shape_transform=lambda shapes, params=None: (shapes[0][0], shapes[1][1]) if len(shapes) >= 2 else shapes[0]), + ] + + def __init__(self, seed: Optional[int] = None, enable_advanced_ops: bool = False): + """Initialize fuzzer with optional seed for reproducibility. + + Args: + seed: Random seed for reproducibility + enable_advanced_ops: Enable advanced operations like row_expand, matmul (default: False) + """ self.rng = random.Random(seed) - # 使用所有操作符 + # 基础操作符集合 self.ops = self.BLOCK_BINARY_OPS + self.BLOCK_SCALAR_OPS + self.BLOCK_UNARY_OPS + # 可选: 启用高级操作符 + if enable_advanced_ops: + self.ops = self.ops + self.BLOCK_ROW_EXPAND_OPS + self.BLOCK_MATRIX_OPS + def generate_op_chain( self, num_ops: int = 5, diff --git a/src/fuzzer/src/kernel_generator.py b/src/fuzzer/src/kernel_generator.py index 9a04870..160bbd0 100644 --- a/src/fuzzer/src/kernel_generator.py +++ b/src/fuzzer/src/kernel_generator.py @@ -8,20 +8,21 @@ import random from typing import Any, Dict, List, Optional, Tuple -from .fuzzer import OpFuzzer +from .fuzzer import OpFuzzer, is_shape_aligned, generate_aligned_shape class KernelGenerator: """生成 InCore 内核函数的生成器""" - def __init__(self, seed: Optional[int] = None): + def __init__(self, seed: Optional[int] = None, enable_advanced_ops: bool = False): 
"""初始化内核生成器 Args: seed: 随机种子,用于可重现性 + enable_advanced_ops: 启用高级算子(row_expand, matmul等) """ self.rng = random.Random(seed) - self.fuzzer = OpFuzzer(seed=seed) + self.fuzzer = OpFuzzer(seed=seed, enable_advanced_ops=enable_advanced_ops) def generate_kernel( self, @@ -60,25 +61,36 @@ def generate_kernel( actual_num_inputs = num_inputs actual_shapes = [shape] * num_inputs + # 验证所有形状是否满足对齐约束 + dtype = "FP32" # 当前仅支持 FP32 + for i, input_shape in enumerate(actual_shapes): + if not is_shape_aligned(input_shape, dtype): + # 如果形状不对齐,使用最接近的对齐形状 + print(f"Warning: Input shape {input_shape} is not 32-byte aligned. Regenerating aligned shape.") + actual_shapes[i] = generate_aligned_shape(self.rng, dtype) + + # 确定输出形状并验证对齐 + if output_shape is not None: + actual_output_shape = output_shape + if not is_shape_aligned(actual_output_shape, dtype): + print(f"Warning: Output shape {actual_output_shape} is not 32-byte aligned. Regenerating aligned shape.") + actual_output_shape = generate_aligned_shape(self.rng, dtype) + else: + actual_output_shape = actual_shapes[0] + # 生成操作链 op_chain = self.fuzzer.generate_op_chain( num_ops=num_ops, input_count=actual_num_inputs, allow_scalars=allow_scalars, track_shapes=False, - default_shape=shape, + default_shape=actual_output_shape, ) # 生成输入参数 input_names = [chr(97 + i) for i in range(actual_num_inputs)] # a, b, c, ... 
inputs = [(name, actual_shapes[i]) for i, name in enumerate(input_names)] - # 确定输出形状:优先使用指定的 output_shape,否则使用第一个输入的形状 - if output_shape is not None: - actual_output_shape = output_shape - else: - actual_output_shape = actual_shapes[0] - # 生成内核代码 code = self._generate_kernel_code( kernel_name=kernel_name, @@ -115,10 +127,12 @@ def _generate_kernel_code( """ rows, cols = output_shape - # 生成函数签名 + # 生成函数签名 - 添加 output_tensor 参数 params = [] for name, (r, c) in inputs: params.append(f"{name}: pl.Tensor[[{r}, {c}], pl.FP32]") + # 添加 output_tensor 参数 + params.append(f"output: pl.Tensor[[{rows}, {cols}], pl.FP32]") code_lines = [ f" @pl.function(type=pl.FunctionType.InCore)", @@ -127,7 +141,7 @@ def _generate_kernel_code( # 加载输入张量 - 使用输出形状作为加载大小 for name, (r, c) in inputs: - code_lines.append(f" tile_{name} = pl.op.block.load({name}, 0, 0, {rows}, {cols})") + code_lines.append(f" tile_{name} = pl.load({name}, offsets=[0, 0], shapes=[{rows}, {cols}])") # 生成操作链 for op_dict in op_chain: @@ -136,20 +150,25 @@ def _generate_kernel_code( output = op_dict["output"] params = op_dict.get("params") + # 去掉 block. 
前缀,直接使用 pl.xxx + op_name = op.name.replace("block.", "") + if params: params_str = ", ".join(f"{k}={v}" for k, v in params.items()) - code_lines.append(f" {output} = pl.op.{op.name}({inputs_str}, {params_str})") + code_lines.append(f" {output} = pl.{op_name}({inputs_str}, {params_str})") else: - code_lines.append(f" {output} = pl.op.{op.name}({inputs_str})") + code_lines.append(f" {output} = pl.{op_name}({inputs_str})") - # 返回最终结果 + # Store 结果并返回 if op_chain: last_output = op_chain[-1]["output"] - code_lines.append(f" return {last_output}") + code_lines.append(f" result = pl.store({last_output}, offsets=[0, 0], shapes=[{rows}, {cols}], output_tensor=output)") + code_lines.append(f" return result") else: - # 如果没有操作,返回第一个输入 + # 如果没有操作,直接 store 第一个输入 first_input = inputs[0][0] - code_lines.append(f" return tile_{first_input}") + code_lines.append(f" result = pl.store(tile_{first_input}, offsets=[0, 0], shapes=[{rows}, {cols}], output_tensor=output)") + code_lines.append(f" return result") return "\n".join(code_lines) diff --git a/src/fuzzer/src/multi_kernel_test_generator.py b/src/fuzzer/src/multi_kernel_test_generator.py index 1c142bf..bb43e51 100644 --- a/src/fuzzer/src/multi_kernel_test_generator.py +++ b/src/fuzzer/src/multi_kernel_test_generator.py @@ -21,16 +21,18 @@ class MultiKernelTestGenerator: """生成多内核测试用例的生成器""" - def __init__(self, seed: Optional[int] = None): + def __init__(self, seed: Optional[int] = None, enable_advanced_ops: bool = False): """初始化测试生成器 Args: seed: 随机种子,用于可重现性 + enable_advanced_ops: 启用高级算子(row_expand, matmul等) """ self.seed = seed - self.kernel_gen = KernelGenerator(seed=seed) + self.enable_advanced_ops = enable_advanced_ops + self.kernel_gen = KernelGenerator(seed=seed, enable_advanced_ops=enable_advanced_ops) self.orch_gen = OrchestratorGenerator(seed=seed) - self.fuzzer = OpFuzzer(seed=seed) + self.fuzzer = OpFuzzer(seed=seed, enable_advanced_ops=enable_advanced_ops) def _compute_output_shapes_for_sequential( self, @@ -149,15 +151,18 
@@ def _regenerate_kernel_code_with_unified_shapes( for inp_name, _ in kernel["inputs"]: unified_shape = input_shapes_map[inp_name] params.append(f"{inp_name}: pl.Tensor[[{unified_shape[0]}, {unified_shape[1]}], pl.FP32]") + # 添加 output_tensor 参数 + params.append(f"output: pl.Tensor[[{rows}, {cols}], pl.FP32]") code_lines = [ f" @pl.function(type=pl.FunctionType.InCore)", f" def {kernel_name}(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:", ] - # 加载输入张量 - 使用输出形状作为加载大小 + # 加载输入张量 - 使用每个输入的实际定义形状 for inp_name, _ in kernel["inputs"]: - code_lines.append(f" tile_{inp_name} = pl.op.block.load({inp_name}, 0, 0, {rows}, {cols})") + inp_shape = input_shapes_map[inp_name] + code_lines.append(f" tile_{inp_name} = pl.load({inp_name}, offsets=[0, 0], shapes=[{inp_shape[0]}, {inp_shape[1]}])") # 生成操作链 for op_dict in op_chain: @@ -166,20 +171,25 @@ def _regenerate_kernel_code_with_unified_shapes( output = op_dict["output"] params_dict = op_dict.get("params") + # 去掉 block. 前缀,直接使用 pl.xxx + op_name = op.name.replace("block.", "") + if params_dict: params_str = ", ".join(f"{k}={v}" for k, v in params_dict.items()) - code_lines.append(f" {output} = pl.op.{op.name}({inputs_str}, {params_str})") + code_lines.append(f" {output} = pl.{op_name}({inputs_str}, {params_str})") else: - code_lines.append(f" {output} = pl.op.{op.name}({inputs_str})") + code_lines.append(f" {output} = pl.{op_name}({inputs_str})") - # 返回最终结果 + # Store 结果并返回 if op_chain: last_output = op_chain[-1]["output"] - code_lines.append(f" return {last_output}") + code_lines.append(f" result = pl.store({last_output}, offsets=[0, 0], shapes=[{rows}, {cols}], output_tensor=output)") + code_lines.append(f" return result") else: - # 如果没有操作,返回第一个输入 + # 如果没有操作,直接 store 第一个输入 first_input = kernel["inputs"][0][0] - code_lines.append(f" return tile_{first_input}") + code_lines.append(f" result = pl.store(tile_{first_input}, offsets=[0, 0], shapes=[{rows}, {cols}], output_tensor=output)") + code_lines.append(f" 
return result") return "\n".join(code_lines) @@ -269,7 +279,8 @@ def _generate_numpy_reference( input_names = [inp[0] for inp in kernel["inputs"]] op_chain = kernel["op_chain"] - code_lines.append(f" def _numpy_{kernel_name}(self, {', '.join(input_names)}):") + # 嵌套函数不需要 self 参数 + code_lines.append(f" def _numpy_{kernel_name}({', '.join(input_names)}):") code_lines.append(f" \"\"\"NumPy 实现: {kernel_name}\"\"\"") # 生成 NumPy 操作 @@ -325,6 +336,7 @@ def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: NumPy 操作表达式字符串 """ # 根据操作类型生成表达式 + # 二元操作 if op_name == "block.add": return f"{input_vals[0]} + {input_vals[1]}" elif op_name == "block.sub": @@ -335,6 +347,9 @@ def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: return f"{input_vals[0]} / {input_vals[1]}" elif op_name == "block.maximum": return f"np.maximum({input_vals[0]}, {input_vals[1]})" + elif op_name == "block.minimum": + return f"np.minimum({input_vals[0]}, {input_vals[1]})" + # 标量操作 elif op_name == "block.adds": return f"{input_vals[0]} + {input_vals[1]}" elif op_name == "block.subs": @@ -343,6 +358,7 @@ def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: return f"{input_vals[0]} * {input_vals[1]}" elif op_name == "block.divs": return f"{input_vals[0]} / {input_vals[1]}" + # 一元操作 elif op_name == "block.sqrt": return f"np.sqrt({input_vals[0]})" elif op_name == "block.rsqrt": @@ -353,6 +369,24 @@ def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: return f"-{input_vals[0]}" elif op_name == "block.recip": return f"1.0 / {input_vals[0]}" + elif op_name == "block.log": + return f"np.log({input_vals[0]})" + elif op_name == "block.abs": + return f"np.abs({input_vals[0]})" + elif op_name == "block.relu": + return f"np.maximum(0, {input_vals[0]})" + # Row expand 操作 + elif op_name == "block.row_expand_add": + return f"{input_vals[0]} + {input_vals[1]}" # Broadcasting + elif op_name == "block.row_expand_sub": + return 
f"{input_vals[0]} - {input_vals[1]}" + elif op_name == "block.row_expand_mul": + return f"{input_vals[0]} * {input_vals[1]}" + elif op_name == "block.row_expand_div": + return f"{input_vals[0]} / {input_vals[1]}" + # 矩阵操作 + elif op_name == "block.matmul": + return f"{input_vals[0]} @ {input_vals[1]}" else: return f"# 未知操作: {op_name}" @@ -466,7 +500,13 @@ def _generate_test_class( # 添加 NumPy 参考实现 code_lines.append(f" def compute_expected(self, tensors, params=None):") code_lines.append(f" \"\"\"使用 NumPy 计算期望输出\"\"\"") - code_lines.append(numpy_code) + # numpy_code 包含嵌套函数定义,需要添加到 compute_expected 内部,所以需要额外缩进 + numpy_lines = numpy_code.split('\n') + for line in numpy_lines: + if line.strip(): # 跳过空行 + code_lines.append(f" {line}") # 添加额外的4个空格缩进 + else: + code_lines.append(line) code_lines.append(f"") # 根据组合模式生成计算逻辑 @@ -490,7 +530,8 @@ def _generate_test_class( inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) result_var = f"result_{i}" - code_lines.append(f" {result_var} = self._numpy_{kernel_name}({inputs_str})") + # 调用嵌套函数不需要 self + code_lines.append(f" {result_var} = _numpy_{kernel_name}({inputs_str})") code_lines.append(f" tensors['output'][:] = {result_var}") @@ -504,7 +545,8 @@ def _generate_test_class( branch_results.append(result_var) inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) - code_lines.append(f" {result_var} = self._numpy_{kernel_name}({inputs_str})") + # 调用嵌套函数不需要 self + code_lines.append(f" {result_var} = _numpy_{kernel_name}({inputs_str})") # 合并结果 if len(branch_results) == 1: @@ -532,7 +574,8 @@ def _generate_test_class( branch_results.append(result_var) inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) - code_lines.append(f" {result_var} = self._numpy_{kernel_name}({inputs_str})") + # 调用嵌套函数不需要 self + code_lines.append(f" {result_var} = _numpy_{kernel_name}({inputs_str})") # 合并并行结果 if len(branch_results) > 1: @@ -557,7 +600,8 @@ def _generate_test_class( for inp in kernel_inputs[1:]: 
inputs_parts.append(f"tensors['{inp}']") inputs_str = ", ".join(inputs_parts) - code_lines.append(f" {result_var} = self._numpy_{kernel_name}({inputs_str})") + # 调用嵌套函数不需要 self + code_lines.append(f" {result_var} = _numpy_{kernel_name}({inputs_str})") current_result = result_var code_lines.append(f" tensors['output'][:] = {current_result}") diff --git a/src/fuzzer/src/orchestrator_generator.py b/src/fuzzer/src/orchestrator_generator.py index 8d09fd0..dddb1ad 100644 --- a/src/fuzzer/src/orchestrator_generator.py +++ b/src/fuzzer/src/orchestrator_generator.py @@ -11,6 +11,8 @@ import random from typing import Any, Dict, List, Optional, Tuple +from .fuzzer import is_shape_aligned + class OrchestratorGenerator: """生成 Orchestration 组合函数的生成器""" @@ -71,7 +73,7 @@ def generate_sequential( f" def orchestrator(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:", ] - # 顺序调用内核 + # 顺序调用内核 - 不需要显式创建 tensor result_var = None for i, kernel in enumerate(kernels): kernel_name = kernel["name"] @@ -82,6 +84,7 @@ def generate_sequential( # 替换第一个输入为前一个内核的输出 kernel_inputs[0] = result_var + # 调用 InCore 函数,框架会自动处理输出 tensor result_var = f"result_{i}" inputs_str = ", ".join(kernel_inputs) code_lines.append(f" {result_var} = self.{kernel_name}({inputs_str})") @@ -145,7 +148,7 @@ def generate_branching( f" def orchestrator(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:", ] - # 并行执行所有内核 + # 并行执行所有内核 - 不需要显式创建 tensor result_vars = [] for i, kernel in enumerate(kernels): kernel_name = kernel["name"] @@ -231,7 +234,7 @@ def generate_mixed( parallel_kernels = kernels[:mid] sequential_kernels = kernels[mid:] - # 并行执行前半部分 + # 并行执行前半部分 - 不需要显式创建 tensor branch_results = [] for i, kernel in enumerate(parallel_kernels): kernel_name = kernel["name"] @@ -290,9 +293,11 @@ def generate_merge_kernel(self, shape: Tuple[int, int] = (128, 128)) -> str: rows, cols = shape code = f""" @pl.function(type=pl.FunctionType.InCore) def merge_results(self, a: pl.Tensor[[{rows}, 
{cols}], pl.FP32], - b: pl.Tensor[[{rows}, {cols}], pl.FP32]) -> pl.Tensor[[{rows}, {cols}], pl.FP32]: - tile_a = pl.op.block.load(a, 0, 0, {rows}, {cols}) - tile_b = pl.op.block.load(b, 0, 0, {rows}, {cols}) - result = pl.op.block.add(tile_a, tile_b) + b: pl.Tensor[[{rows}, {cols}], pl.FP32], + output: pl.Tensor[[{rows}, {cols}], pl.FP32]) -> pl.Tensor[[{rows}, {cols}], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[{rows}, {cols}]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[{rows}, {cols}]) + result_tile = pl.add(tile_a, tile_b) + result = pl.store(result_tile, offsets=[0, 0], shapes=[{rows}, {cols}], output_tensor=output) return result""" return code diff --git a/tests/test_cases/test_expand.py b/tests/test_cases/test_expand.py new file mode 100644 index 0000000..4e14344 --- /dev/null +++ b/tests/test_cases/test_expand.py @@ -0,0 +1,404 @@ +""" +Tests for row_expand_div operation using PyPTO frontend. + +This test demonstrates the row_expand_div operation which expands a row vector +and performs element-wise division with a matrix. +""" + +import sys +from pathlib import Path +from typing import Any, List + +import numpy as np +import pytest + +from pto_test.core import environment +from pto_test.core.test_case import DataType, PTOTestCase, TensorSpec + +# Add pypto to path +_PYPTO_PYTHON = environment.get_pypto_python_path() +if _PYPTO_PYTHON is not None and _PYPTO_PYTHON.exists() and str(_PYPTO_PYTHON) not in sys.path: + sys.path.insert(0, str(_PYPTO_PYTHON)) + + +class TestRowExpandDivBase(PTOTestCase): + """Base test case for row_expand_div operation. + + This operation takes a matrix and a column vector, and divides each row + of the matrix by the corresponding scalar value from the column vector. 
+ + For example: + - Matrix a: [[6, 8], [12, 16]] + - Column vector b: [[2], [4]] + - Result c: [[6/2, 8/2], [12/4, 16/4]] = [[3, 4], [3, 4]] + + Note: PyPTO requires shape dimensions to be compile-time constants in type + annotations, so each shape needs its own subclass with get_program() method. + """ + + # Subclasses must define these + ROWS = 128 + COLS = 128 + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.rows = self.ROWS + self.cols = self.COLS + + def get_name(self) -> str: + return f"row_expand_div_{self.rows}x{self.cols}" + + def define_tensors(self) -> List[TensorSpec]: + return [ + # Matrix to be divided (random values) + TensorSpec("a", [self.rows, self.cols], DataType.FP32, + init_value=lambda shape: np.random.rand(*shape).astype(np.float32)), + # Column vector (divisor) - shape is [rows, 1] (random values, avoid division by zero) + TensorSpec("b", [self.rows, 1], DataType.FP32, + init_value=lambda shape: (np.random.rand(*shape) + 0.1).astype(np.float32)), + # Output tensor + TensorSpec("c", [self.rows, self.cols], DataType.FP32, is_output=True), + ] + + def compute_expected(self, tensors, params=None): + """Compute expected output: each row of a divided by corresponding scalar in b.""" + # Broadcasting: a[rows, cols] / b[rows, 1] -> c[rows, cols] + tensors["c"][:] = tensors["a"] / tensors["b"] + + +# Generate test classes for different shapes +class TestRowExpandDiv_32x32(TestRowExpandDivBase): + ROWS = 32 + COLS = 32 + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class RowExpandDivProgram: + @pl.function + def row_expand_div( + self, + a: pl.Tensor[[32, 32], pl.FP32], + b: pl.Tensor[[1, 32], pl.FP32], + c: pl.Tensor[[32, 32], pl.FP32], + ) -> pl.Tensor[[32, 32], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[32, 32]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[1, 32]) + + tile_b_reshaped = pl.reshape(tile_b, [32, 1]) + + tile_c = pl.row_expand_div(tile_a, tile_b_reshaped) + + out_c 
= pl.store(tile_c, offsets=[0, 0], shapes=[32, 32], output_tensor=c) + return out_c + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[32, 32], pl.FP32], + b: pl.Tensor[[1, 32], pl.FP32] + ) -> pl.Tensor[[32, 32], pl.FP32]: + out_c = self.row_expand_div(a, b) + return out_c + + return RowExpandDivProgram + + +class TestRowExpandDiv_64x64(TestRowExpandDivBase): + ROWS = 64 + COLS = 64 + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class RowExpandDivProgram: + @pl.function + def row_expand_div( + self, + a: pl.Tensor[[64, 64], pl.FP32], + b: pl.Tensor[[64, 1], pl.FP32], + c: pl.Tensor[[64, 64], pl.FP32], + ) -> pl.Tensor[[64, 64], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[64, 64]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[64, 1]) + tile_c = pl.row_expand_div(tile_a, tile_b) + out_c = pl.store(tile_c, offsets=[0, 0], shapes=[64, 64], output_tensor=c) + return out_c + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[64, 64], pl.FP32], + b: pl.Tensor[[64, 1], pl.FP32] + ) -> pl.Tensor[[64, 64], pl.FP32]: + out_c = self.row_expand_div(a, b) + return out_c + + return RowExpandDivProgram + + +class TestRowExpandDiv_128x128(TestRowExpandDivBase): + ROWS = 128 + COLS = 128 + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class RowExpandDivProgram: + @pl.function + def row_expand_div( + self, + a: pl.Tensor[[128, 128], pl.FP32], + b: pl.Tensor[[128, 1], pl.FP32], + c: pl.Tensor[[128, 128], pl.FP32], + ) -> pl.Tensor[[128, 128], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 128]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 1]) + tile_c = pl.row_expand_div(tile_a, tile_b) + out_c = pl.store(tile_c, offsets=[0, 0], shapes=[128, 128], output_tensor=c) + return out_c + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[128, 128], pl.FP32], + 
b: pl.Tensor[[128, 1], pl.FP32] + ) -> pl.Tensor[[128, 128], pl.FP32]: + out_c = self.row_expand_div(a, b) + return out_c + + return RowExpandDivProgram + + +class TestRowExpandDiv_128x64(TestRowExpandDivBase): + ROWS = 128 + COLS = 64 + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class RowExpandDivProgram: + @pl.function + def row_expand_div( + self, + a: pl.Tensor[[128, 64], pl.FP32], + b: pl.Tensor[[128, 1], pl.FP32], + c: pl.Tensor[[128, 64], pl.FP32], + ) -> pl.Tensor[[128, 64], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 64]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 1]) + tile_c = pl.row_expand_div(tile_a, tile_b) + out_c = pl.store(tile_c, offsets=[0, 0], shapes=[128, 64], output_tensor=c) + return out_c + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[128, 64], pl.FP32], + b: pl.Tensor[[128, 1], pl.FP32] + ) -> pl.Tensor[[128, 64], pl.FP32]: + out_c = self.row_expand_div(a, b) + return out_c + + return RowExpandDivProgram + + +class TestRowExpandDiv_64x128(TestRowExpandDivBase): + ROWS = 64 + COLS = 128 + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class RowExpandDivProgram: + @pl.function + def row_expand_div( + self, + a: pl.Tensor[[64, 128], pl.FP32], + b: pl.Tensor[[64, 1], pl.FP32], + c: pl.Tensor[[64, 128], pl.FP32], + ) -> pl.Tensor[[64, 128], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[64, 128]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[64, 1]) + tile_c = pl.row_expand_div(tile_a, tile_b) + out_c = pl.store(tile_c, offsets=[0, 0], shapes=[64, 128], output_tensor=c) + return out_c + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[64, 128], pl.FP32], + b: pl.Tensor[[64, 1], pl.FP32] + ) -> pl.Tensor[[64, 128], pl.FP32]: + out_c = self.row_expand_div(a, b) + return out_c + + return RowExpandDivProgram + + +class 
TestRowExpandDiv_96x96(TestRowExpandDivBase): + ROWS = 96 + COLS = 96 + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class RowExpandDivProgram: + @pl.function + def row_expand_div( + self, + a: pl.Tensor[[96, 96], pl.FP32], + b: pl.Tensor[[96, 1], pl.FP32], + c: pl.Tensor[[96, 96], pl.FP32], + ) -> pl.Tensor[[96, 96], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[96, 96]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[96, 1]) + tile_c = pl.row_expand_div(tile_a, tile_b) + out_c = pl.store(tile_c, offsets=[0, 0], shapes=[96, 96], output_tensor=c) + return out_c + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[96, 96], pl.FP32], + b: pl.Tensor[[96, 1], pl.FP32] + ) -> pl.Tensor[[96, 96], pl.FP32]: + out_c = self.row_expand_div(a, b) + return out_c + + return RowExpandDivProgram + + +class TestRowExpandDiv_80x96(TestRowExpandDivBase): + ROWS = 80 + COLS = 96 + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class RowExpandDivProgram: + @pl.function + def row_expand_div( + self, + a: pl.Tensor[[80, 96], pl.FP32], + b: pl.Tensor[[80, 1], pl.FP32], + c: pl.Tensor[[80, 96], pl.FP32], + ) -> pl.Tensor[[80, 96], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[80, 96]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[80, 1]) + tile_c = pl.row_expand_div(tile_a, tile_b) + out_c = pl.store(tile_c, offsets=[0, 0], shapes=[80, 96], output_tensor=c) + return out_c + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[80, 96], pl.FP32], + b: pl.Tensor[[80, 1], pl.FP32] + ) -> pl.Tensor[[80, 96], pl.FP32]: + out_c = self.row_expand_div(a, b) + return out_c + + return RowExpandDivProgram + + +class TestRowExpandDiv_96x80(TestRowExpandDivBase): + ROWS = 96 + COLS = 80 + + def get_program(self) -> Any: + import pypto.language as pl + + @pl.program + class RowExpandDivProgram: + @pl.function + def row_expand_div( + 
self, + a: pl.Tensor[[96, 80], pl.FP32], + b: pl.Tensor[[96, 1], pl.FP32], + c: pl.Tensor[[96, 80], pl.FP32], + ) -> pl.Tensor[[96, 80], pl.FP32]: + tile_a = pl.load(a, offsets=[0, 0], shapes=[96, 80]) + tile_b = pl.load(b, offsets=[0, 0], shapes=[96, 1]) + tile_c = pl.row_expand_div(tile_a, tile_b) + out_c = pl.store(tile_c, offsets=[0, 0], shapes=[96, 80], output_tensor=c) + return out_c + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[96, 80], pl.FP32], + b: pl.Tensor[[96, 1], pl.FP32] + ) -> pl.Tensor[[96, 80], pl.FP32]: + out_c = self.row_expand_div(a, b) + return out_c + + return RowExpandDivProgram + + +# ============================================================================= +# pytest test functions +# ============================================================================= + + +def test_row_expand_div_32x32(test_runner): + """Test 32x32 shape.""" + test_case = TestRowExpandDiv_32x32() + result = test_runner.run(test_case) + assert result.passed, f"Test failed: {result.error}" + + +def test_row_expand_div_64x64(test_runner): + """Test 64x64 shape.""" + test_case = TestRowExpandDiv_64x64() + result = test_runner.run(test_case) + assert result.passed, f"Test failed: {result.error}" + + +def test_row_expand_div_128x128(test_runner): + """Test 128x128 shape.""" + test_case = TestRowExpandDiv_128x128() + result = test_runner.run(test_case) + assert result.passed, f"Test failed: {result.error}" + + +def test_row_expand_div_128x64(test_runner): + """Test 128x64 shape.""" + test_case = TestRowExpandDiv_128x64() + result = test_runner.run(test_case) + assert result.passed, f"Test failed: {result.error}" + + +def test_row_expand_div_64x128(test_runner): + """Test 64x128 shape.""" + test_case = TestRowExpandDiv_64x128() + result = test_runner.run(test_case) + assert result.passed, f"Test failed: {result.error}" + + +def test_row_expand_div_96x96(test_runner): + """Test 96x96 shape.""" + test_case = 
TestRowExpandDiv_96x96() + result = test_runner.run(test_case) + assert result.passed, f"Test failed: {result.error}" + + +def test_row_expand_div_80x96(test_runner): + """Test 80x96 shape.""" + test_case = TestRowExpandDiv_80x96() + result = test_runner.run(test_case) + assert result.passed, f"Test failed: {result.error}" + + +def test_row_expand_div_96x80(test_runner): + """Test 96x80 shape.""" + test_case = TestRowExpandDiv_96x80() + result = test_runner.run(test_case) + assert result.passed, f"Test failed: {result.error}" From cb570a60e895650b9aefeba3fbc1fbec0cf7d526 Mon Sep 17 00:00:00 2001 From: majin0824 Date: Thu, 12 Feb 2026 15:55:09 +0800 Subject: [PATCH 3/3] =?UTF-8?q?[WIP]fuzz=E4=B8=80=E4=B8=AAkernel=E8=83=BD?= =?UTF-8?q?=E8=B7=91=E9=80=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run.log | 122 ++++ src/fuzzer/CHANGELOG.md | 332 ----------- src/fuzzer/OP_RULES.md | 370 ------------ src/fuzzer/README.md | 554 ++++++++++++------ src/fuzzer/example_multi_kernel.py | 294 +++++++--- .../generated_tests/test_fuzz_multi_kernel.py | 67 ++- src/fuzzer/src/kernel_generator.py | 4 +- src/fuzzer/src/multi_kernel_test_generator.py | 195 ++++-- src/pto_test/codegen/golden_generator.py | 2 + 9 files changed, 900 insertions(+), 1040 deletions(-) create mode 100644 run.log delete mode 100644 src/fuzzer/CHANGELOG.md delete mode 100644 src/fuzzer/OP_RULES.md diff --git a/run.log b/run.log new file mode 100644 index 0000000..f1e6fe9 --- /dev/null +++ b/run.log @@ -0,0 +1,122 @@ +============================= test session starts ============================== +platform linux -- Python 3.10.19, pytest-9.0.2, pluggy-1.6.0 -- /data/m00956180/.conda/envs/mjzkd/bin/python3.10 +cachedir: .pytest_cache +rootdir: /data/m00956180/runtime/ptofuzz +configfile: pyproject.toml +plugins: forked-1.6.0, repeat-0.9.4 +collecting ... 
collected 1 item + +src/fuzzer/generated_tests/test_fuzz_multi_kernel.py::TestMultiKernelFuzzing::test_fuzz_sequential_simple 2026-02-12 15:53:42.801 D | Created 1 basic blocks +2026-02-12 15:53:42.801 I | Identified 1 basic blocks +2026-02-12 15:53:42.801 D | Found 6 dependencies in block 0 +2026-02-12 15:53:42.801 I | Found 6 dependency edges +2026-02-12 15:53:42.801 I | Dependency types: RAW=6, WAR=0, WAW=0 +2026-02-12 15:53:42.801 D | Assigned declaration order to 8 statements +2026-02-12 15:53:42.801 D | Variable tile_a has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 3 +2026-02-12 15:53:42.801 D | Lifetime for tile_a: [0, 3] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tile_b has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 2 +2026-02-12 15:53:42.801 D | Lifetime for tile_b: [1, 2] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_0 has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 5 +2026-02-12 15:53:42.801 D | Lifetime for tmp_0: [2, 5] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_1 has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 4 +2026-02-12 15:53:42.801 D | Lifetime for tmp_1: [3, 4] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_2 has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 5 +2026-02-12 15:53:42.801 D | Lifetime for tmp_2: [4, 5] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_3 has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 6 +2026-02-12 15:53:42.801 D | Lifetime for tmp_3: [5, 6] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_1 can reuse tile_b (lifetime [3, 4] vs [1, 2]) +2026-02-12 15:53:42.801 D | Variable tmp_2 can reuse tile_a (lifetime [4, 5] vs [0, 3]) +2026-02-12 15:53:42.801 D | Variable tmp_3 cannot reuse tile_a due to overlap with existing user tmp_2 (lifetime [5, 6] vs [4, 5]) +2026-02-12 15:53:42.801 D | Variable tmp_3 can reuse tile_b (lifetime [5, 6] vs [1, 2]) 
+2026-02-12 15:53:42.801 D | Created 1 basic blocks +2026-02-12 15:53:42.801 I | Identified 1 basic blocks +2026-02-12 15:53:42.801 D | Found 8 dependencies in block 0 +2026-02-12 15:53:42.801 I | Found 8 dependency edges +2026-02-12 15:53:42.801 I | Dependency types: RAW=8, WAR=0, WAW=0 +2026-02-12 15:53:42.801 D | Assigned declaration order to 9 statements +2026-02-12 15:53:42.801 D | Variable tile_a has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 2 +2026-02-12 15:53:42.801 D | Lifetime for tile_a: [0, 2] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tile_b has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 2 +2026-02-12 15:53:42.801 D | Lifetime for tile_b: [1, 2] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_0 has 2 use statements +2026-02-12 15:53:42.801 D | Use at order 3 +2026-02-12 15:53:42.801 D | Use at order 5 +2026-02-12 15:53:42.801 D | Lifetime for tmp_0: [2, 5] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_1 has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 4 +2026-02-12 15:53:42.801 D | Lifetime for tmp_1: [3, 4] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_2 has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 6 +2026-02-12 15:53:42.801 D | Lifetime for tmp_2: [4, 6] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_3 has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 6 +2026-02-12 15:53:42.801 D | Lifetime for tmp_3: [5, 6] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_4 has 1 use statements +2026-02-12 15:53:42.801 D | Use at order 7 +2026-02-12 15:53:42.801 D | Lifetime for tmp_4: [6, 7] space=1 size=65536 +2026-02-12 15:53:42.801 D | Variable tmp_1 can reuse tile_a (lifetime [3, 4] vs [0, 2]) +2026-02-12 15:53:42.801 D | Variable tmp_2 cannot reuse tile_a due to overlap with existing user tmp_1 (lifetime [4, 6] vs [3, 4]) +2026-02-12 15:53:42.801 D | Variable tmp_2 can reuse tile_b (lifetime [4, 6] vs 
[1, 2]) +2026-02-12 15:53:42.801 D | Variable tmp_3 can reuse tile_a (lifetime [5, 6] vs [0, 2]) +2026-02-12 15:53:42.801 D | Variable tmp_4 cannot reuse tile_a due to overlap with existing user tmp_3 (lifetime [6, 7] vs [5, 6]) +2026-02-12 15:53:42.802 D | Variable tmp_4 cannot reuse tile_b due to overlap with existing user tmp_2 (lifetime [6, 7] vs [4, 6]) +2026-02-12 15:53:42.802 D | Variable tmp_4 can reuse tmp_0 (lifetime [6, 7] vs [2, 5]) +2026-02-12 15:53:42.802 D | Created 1 basic blocks +2026-02-12 15:53:42.802 I | Identified 1 basic blocks +2026-02-12 15:53:42.802 D | Found 1 dependencies in block 0 +2026-02-12 15:53:42.802 I | Found 1 dependency edges +2026-02-12 15:53:42.802 I | Dependency types: RAW=1, WAR=0, WAW=0 +2026-02-12 15:53:42.802 D | Assigned declaration order to 3 statements +2026-02-12 15:53:42.802 W | No TileType variables found, skipping memory reuse +2026-02-12 15:53:42.803 D | Created 1 basic blocks +2026-02-12 15:53:42.803 D | Found 1 dependencies in block 0 +[INFO] ensure_device_set: DeviceRunner: device=9 set, streams created +[INFO] init_runtime_impl: Registering 2 kernel(s) in init_runtime_impl +[INFO] init_runtime_impl: Loaded orchestration function: BuildOrchestrator +[INFO] init_runtime_impl: === Calling Orchestration Function === +[INFO] init_runtime_impl: Runtime initialized. Ready for execution from Python. 
+[INFO] ensure_binaries_loaded: DeviceRunner: binaries loaded + +=== Initialize runtime args === + +=== launch_aicpu_kernel DynTileFwkKernelServerInit=== + +=== launch_aicpu_kernel DynTileFwkKernelServer=== + +=== launch_aicore_kernel=== + +=== rtStreamSynchronize stream_aicpu_=== + +=== rtStreamSynchronize stream_aicore_=== +[INFO] validate_runtime_impl: === Copying Results Back to Host === +[INFO] validate_runtime_impl: === Cleaning Up === +[INFO] validate_runtime_impl: Freed 1 device tensors +[INFO] validate_runtime_impl: === Finalize Complete === +PASSED + +=============================== warnings summary =============================== +../../.conda/envs/mjzkd/lib/python3.10/site-packages/torch_npu/utils/collect_env.py:58 +../../.conda/envs/mjzkd/lib/python3.10/site-packages/torch_npu/utils/collect_env.py:58 + /data/m00956180/.conda/envs/mjzkd/lib/python3.10/site-packages/torch_npu/utils/collect_env.py:58: UserWarning: Warning: The /usr/local/Ascend/cann-8.5.0 owner does not match the current owner. + warnings.warn(f"Warning: The {path} owner does not match the current owner.") + +../../.conda/envs/mjzkd/lib/python3.10/site-packages/torch_npu/utils/collect_env.py:58 +../../.conda/envs/mjzkd/lib/python3.10/site-packages/torch_npu/utils/collect_env.py:58 + /data/m00956180/.conda/envs/mjzkd/lib/python3.10/site-packages/torch_npu/utils/collect_env.py:58: UserWarning: Warning: The /usr/local/Ascend/cann-8.5.0/aarch64-linux/ascend_toolkit_install.info owner does not match the current owner. 
+ warnings.warn(f"Warning: The {path} owner does not match the current owner.") + +src/fuzzer/generated_tests/test_fuzz_multi_kernel.py:24 + /data/m00956180/runtime/ptofuzz/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py:24: PytestCollectionWarning: cannot collect test class 'TestFuzzSequentialSimple' because it has a __init__ constructor (from: src/fuzzer/generated_tests/test_fuzz_multi_kernel.py) + class TestFuzzSequentialSimple(PTOTestCase): + +-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html +======================== 1 passed, 5 warnings in 12.83s ======================== +[INFO] finalize: DeviceRunner finalized diff --git a/src/fuzzer/CHANGELOG.md b/src/fuzzer/CHANGELOG.md deleted file mode 100644 index c6cc5f2..0000000 --- a/src/fuzzer/CHANGELOG.md +++ /dev/null @@ -1,332 +0,0 @@ -# Fuzzer 框架更新日志 - -## 2026-02-11 - 输入形状一致性修复 - -### Bug 修复 - -**修复形状不一致导致的 NumPy 广播错误** - -**问题**: 测试用例配置中使用了不同维度的输入形状,导致 NumPy 参考实现中出现广播错误 -```python -# ✗ 错误:不同维度的输入 -"input_shapes_list": [ - [(128, 128), (64, 64)], # kernel_0: 不同维度 - [(128, 128), (128, 128), (96, 96)], # kernel_1: 混合维度 -] - -# NumPy 计算时报错: -# ValueError: operands could not be broadcast together with shapes (128,128) (96,96) -``` - -**修复后**: -```python -# ✓ 正确:所有输入使用相同维度 -"input_shapes_list": [ - [(128, 128), (128, 128)], # kernel_0: 统一维度 - [(128, 128), (128, 128), (128, 128)], # kernel_1: 统一维度 -] -``` - -**影响范围**: -- **src/fuzzer/example_multi_kernel.py**: - - `fuzz_sequential_simple`: 所有输入改为 128x128 - - `fuzz_branching_parallel`: 所有输入改为 128x128 - - `fuzz_branching_wide`: 所有输入改为 128x128 - -**根本原因**: -- 当内核中有操作涉及不同形状的输入时(如 96x96 和 128x128),会导致 NumPy 广播失败 -- 虽然 PyPTO IR 代码生成时使用了正确的 load 形状,但运算过程中仍会出现形状不匹配 - -**设计决策**: -- 简化测试用例配置,统一使用相同形状的输入 -- 避免在计算过程中处理复杂的形状变换逻辑 -- 确保 NumPy 参考实现和 PyPTO IR 代码行为一致 - -**症状**: `ValueError: operands could not be broadcast together with shapes (128,128) (96,96)` - ---- - -## 2026-02-11 - NumPy 嵌套函数修复 - -### Bug 修复 - -**修复 compute_expected 中嵌套函数的 self 参数问题** - 
-**问题**: 生成的 NumPy 参考实现函数包含了错误的 `self` 参数 -```python -def compute_expected(self, tensors, params=None): - def _numpy_kernel_0(self, a, b): # ✗ 错误:嵌套函数不应该有 self - ... - result_0 = self._numpy_kernel_0(...) # ✗ 错误调用方式 -``` - -**修复后**: -```python -def compute_expected(self, tensors, params=None): - def _numpy_kernel_0(a, b): # ✓ 正确:嵌套函数不需要 self - ... - result_0 = _numpy_kernel_0(...) # ✓ 正确:直接调用 -``` - -**影响范围**: -- **src/fuzzer/src/multi_kernel_test_generator.py**: - - `_generate_numpy_reference()`: 移除嵌套函数的 `self` 参数(第281行) - - `_generate_test_class()`: 所有调用改为直接调用而不使用 `self.`(第532、546、574、599行) - -**症状**: `NameError: name 'self' is not defined` - ---- - -## 2026-02-11 - 形状大小限制 - -### 性能优化 - -**限制最大形状尺寸**: 避免内存溢出,将最大形状从 256x256 限制到 128x128 - -**变更内容**: -1. **fuzzer.py**: - - `get_aligned_shapes()`: 添加 `max_size` 参数,默认 128 - - `generate_aligned_shape()`: 默认 `max_size` 改为 128 - - 常用行数列表从 `[32, 64, 80, 96, 128, 160, 192, 224, 256]` 改为 `[32, 64, 80, 96, 128]` - -2. **example_multi_kernel.py**: - - 所有 256x256 形状改为 96x96 - - 示例配置使用更小、更安全的形状组合 - -**原因**: -- 避免超过硬件内存限制 -- 提高测试执行速度 -- 减少内存分配失败的风险 - -**影响**: -- 生成的测试用例形状范围: 32x32 到 128x128 -- 对齐的列数: 1, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128 - ---- - -## 2026-02-11 - Orchestrator 模式修正 - -### 架构变更 (Breaking Change) - -**Orchestrator 不再显式创建 tensor**: 修正 Orchestration 函数以匹配 PyPTO 框架的正确模式 - -**变更前** (错误): -```python -@pl.function(type=pl.FunctionType.Orchestration) -def orchestrator(self, a: ..., b: ...) -> ...: - # ✗ 错误: 不应该显式创建 tensor - tmp_0 = pl.tensor.create([128, 128], pl.FP32) - tmp_1 = pl.tensor.create([128, 128], pl.FP32) - - tmp_0 = self.kernel_0(a, b, tmp_0) - tmp_1 = self.kernel_1(tmp_0, b, tmp_1) - return tmp_1 -``` - -**变更后** (正确): -```python -@pl.function(type=pl.FunctionType.Orchestration) -def orchestrator(self, a: ..., b: ...) -> ...: - # ✓ 正确: 框架自动管理输出 tensor - result_0 = self.kernel_0(a, b) - result_1 = self.kernel_1(result_0, b) - return result_1 -``` - -**关键区别**: -1. 
**Orchestration 函数不创建 tensor**: 移除所有 `pl.tensor.create()` 调用 -2. **调用 InCore 函数时只传入输入参数**: 不需要传递输出 tensor -3. **框架自动管理输出**: PyPTO 框架会自动分配和管理 InCore 函数的输出 tensor - -**InCore 函数签名保持不变**: -```python -@pl.function(type=pl.FunctionType.InCore) -def kernel_0(self, a: ..., b: ..., output: ...) -> ...: - # InCore 函数仍然需要 output 参数 - tile_a = pl.load(a, ...) - result = pl.store(tile_result, ..., output_tensor=output) - return result -``` - -### 影响范围 - -- **src/fuzzer/src/orchestrator_generator.py**: - - `generate_sequential()`: 移除 tensor 创建,简化 kernel 调用 - - `generate_branching()`: 已经正确,无需修改 - - `generate_mixed()`: 已经正确,无需修改 - - `generate_merge_kernel()`: 移除对齐验证(仍保留 output 参数) - -### 参考 - -- [tests/test_cases/test_expand.py](../../tests/test_cases/test_expand.py): Orchestration 模式参考 -- [tests/test_cases/test_matmul.py](../../tests/test_cases/test_matmul.py): Orchestration 模式参考 - ---- - -## 2026-02-11 - 形状对齐约束和验证 - -### 新增功能 - -1. **32字节对齐约束** (fuzzer.py) - - 添加 `is_shape_aligned()` 函数验证形状是否满足32字节对齐 - - 添加 `get_aligned_shapes()` 函数获取所有对齐的常用形状 - - 添加 `generate_aligned_shape()` 函数随机生成对齐的形状 - - 支持多种数据类型: FP32, FP16, INT32, INT8 - -2. **自动形状验证** (kernel_generator.py) - - `generate_kernel()` 自动验证输入输出形状 - - 检测到不对齐的形状时自动生成对齐的替代形状 - - 打印警告信息提示形状不对齐 - -3. **Orchestrator 形状验证** (orchestrator_generator.py) - - 在创建临时 tensor 时验证形状对齐 - - 在 merge_kernel 生成时验证形状对齐 - - 打印警告信息提示不对齐的形状 - -4. **文档更新** (OP_RULES.md) - - 新增第 0 节: 形状对齐约束 - - 详细说明32字节对齐规则 - - 提供对齐和不对齐的形状示例 - - 说明 Fuzzer 中的对齐验证工具 - -### 对齐规则 - -**核心约束**: -- 形状的尾轴(列数)必须满足: `cols == 1` 或 `(cols * sizeof(dtype)) % 32 == 0` - -**FP32 类型的有效尾轴值**: -- 1, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, ..., 128, ... 
- -**示例**: -```python -# ✓ 有效 -(128, 1) # 尾轴=1 -(128, 8) # 8*4=32 -(128, 64) # 64*4=256 -(128, 128) # 128*4=512 - -# ✗ 无效 -(128, 3) # 3*4=12, 不对齐 -(128, 5) # 5*4=20, 不对齐 -(128, 10) # 10*4=40, 不对齐 -``` - -### 影响范围 - -- **src/fuzzer/src/fuzzer.py**: 新增对齐验证工具函数 -- **src/fuzzer/src/kernel_generator.py**: 导入并使用对齐验证 -- **src/fuzzer/src/orchestrator_generator.py**: 导入并使用对齐验证 -- **src/fuzzer/OP_RULES.md**: 新增第 0 节文档 - -### 向后兼容性 - -- 现有代码如果使用不对齐的形状,会自动修正并打印警告 -- 不会导致生成失败,而是自动选择最接近的对齐形状 -- 建议手动检查生成的代码,确保形状符合预期 - ---- - -## 2026-02-11 - API 简化和算子扩展 - -### API 变更 (Breaking Change) - -**简化 PyPTO API 调用**: 将 `pl.op.block.xxx` 简化为 `pl.xxx` - -**变更前**: -```python -tile_a = pl.op.block.load(a, 0, 0, 128, 128) -tmp_0 = pl.op.block.add(tile_a, tile_b) -result = pl.op.block.relu(tmp_0) -``` - -**变更后**: -```python -tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 128]) -tmp_0 = pl.add(tile_a, tile_b) -result = pl.relu(tmp_0) -``` - -**影响范围**: -- `kernel_generator.py`: 内核代码生成 -- `multi_kernel_test_generator.py`: 测试类代码生成 -- `orchestrator_generator.py`: 合并内核生成 -- `OP_RULES.md`: 文档示例 -- `README.md`: 文档示例 - -### 新增功能 - -1. **扩展算子支持** (fuzzer.py) - - 新增一元算子: `log`, `abs`, `relu` - - 新增二元算子: `minimum` - - 新增高级算子组: - - Row expand 系列: `row_expand_add`, `row_expand_sub`, `row_expand_mul`, `row_expand_div` - - Matrix 算子: `matmul` - -2. **高级算子开关** - - 添加 `enable_advanced_ops` 参数到所有生成器类 - - 基础模式: 使用标准算子 (add, mul, sqrt, exp等) - - 高级模式: 额外包含 row_expand 和 matmul 算子 - -3. **算子规则文档** ([OP_RULES.md](OP_RULES.md)) - - 完整的算子分类和定义 - - 每个算子的形状约束说明 - - 常见算子组合模式 (Softmax, LayerNorm, GELU, ReLU变体等) - - 禁止的算子组合和约束处理 - - Fuzzer 生成策略建议 - -### 修改文件 - -1. **src/fuzzer.py** - - 扩展 `BLOCK_UNARY_OPS`: 新增 log, abs, relu - - 扩展 `BLOCK_BINARY_OPS`: 新增 minimum - - 新增 `BLOCK_ROW_EXPAND_OPS`: row_expand_* 系列 - - 新增 `BLOCK_MATRIX_OPS`: matmul - - 添加 `enable_advanced_ops` 参数 - - 简化 row_expand 操作的输入类型定义 - -2. **src/kernel_generator.py** - - 添加 `enable_advanced_ops` 参数支持 - - 传递高级算子开关到 OpFuzzer - -3. 
**src/multi_kernel_test_generator.py** - - 添加 `enable_advanced_ops` 参数支持 - - 更新 `_get_numpy_operation` 方法支持所有新算子: - - log, abs, relu, minimum - - row_expand_add, row_expand_sub, row_expand_mul, row_expand_div - - matmul - -4. **example_multi_kernel.py** - - 添加 `--enable-advanced-ops` 命令行参数 - - 在输出中显示是否启用高级算子 - -5. **README.md** - - 更新快速开始部分,区分基础和高级示例 - - 添加高级算子使用说明 - - 添加对 OP_RULES.md 的引用 - - 更新算子列表和约束说明 - -### 使用方法 - -#### 基础模式 (默认) -```bash -python src/fuzzer/example_multi_kernel.py --num-cases 3 -``` -使用算子: add, sub, mul, div, maximum, minimum, adds, subs, muls, divs, sqrt, rsqrt, exp, neg, recip, log, abs, relu - -#### 高级模式 -```bash -python src/fuzzer/example_multi_kernel.py --num-cases 3 --enable-advanced-ops -``` -额外包含: row_expand_add, row_expand_sub, row_expand_mul, row_expand_div, matmul - -### 算子约束 - -- **avoid_zero**: div, divs, recip, row_expand_div -- **positive_only**: sqrt, rsqrt, log -- **row_vec_required**: row_expand_* 系列 (第二个输入需要 [M,1] 形状) - -### 参考文档 - -- [OP_RULES.md](OP_RULES.md) - 完整的算子规则和组合模式 -- [README.md](README.md) - 框架使用文档 -- [tests/test_cases/test_expand.py](../../tests/test_cases/test_expand.py) - row_expand 使用示例 diff --git a/src/fuzzer/OP_RULES.md b/src/fuzzer/OP_RULES.md deleted file mode 100644 index a1bac79..0000000 --- a/src/fuzzer/OP_RULES.md +++ /dev/null @@ -1,370 +0,0 @@ -# PyPTO 算子组合规则 (Op Combination Rules) - -本文档定义了 PyPTO IR 中所有支持的算子及其组合规则,用于指导 fuzzer 生成合法的算子组合。 - -## 0. 形状对齐约束 (Shape Alignment Constraints) - -### 0.1 32字节对齐规则 - -**重要**: 所有 tensor 创建和 reshape 操作必须满足 32 字节对齐约束。 - -**规则**: -- 形状的**尾轴**(最后一个维度,即列数)必须满足以下条件之一: - 1. 尾轴 = 1, 或者 - 2. (尾轴 × sizeof(datatype)) % 32 == 0 - -**数据类型大小**: -- FP32: 4 字节 -- FP16: 2 字节 -- INT32: 4 字节 -- INT8: 1 字节 - -**FP32 类型的有效尾轴值**: -- 尾轴 = 1 (总是有效) -- 尾轴 % 8 == 0 (因为 8 × 4 = 32) -- 有效值: 1, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, ... 
- -**示例 (FP32)**: -```python -# ✓ 有效的形状 -pl.tensor.create([128, 1], pl.FP32) # 尾轴=1 -pl.tensor.create([128, 8], pl.FP32) # 8*4=32, 对齐 -pl.tensor.create([128, 16], pl.FP32) # 16*4=64, 对齐 -pl.tensor.create([128, 32], pl.FP32) # 32*4=128, 对齐 -pl.tensor.create([128, 64], pl.FP32) # 64*4=256, 对齐 -pl.tensor.create([128, 128], pl.FP32) # 128*4=512, 对齐 - -# ✗ 无效的形状 -pl.tensor.create([128, 3], pl.FP32) # 3*4=12, 不对齐 -pl.tensor.create([128, 5], pl.FP32) # 5*4=20, 不对齐 -pl.tensor.create([128, 7], pl.FP32) # 7*4=28, 不对齐 -pl.tensor.create([128, 10], pl.FP32) # 10*4=40, 不对齐 (40 % 32 = 8) -``` - -**Reshape 约束**: -```python -# 示例: reshape 操作也必须满足对齐约束 -tile_tmp = pl.create_tile([8, 1], dtype=pl.FP32, target_memory=1) # ✓ 尾轴=1 -tile_reshaped = pl.reshape(tile_tmp, [1, 8]) # ✓ 尾轴=8, 8*4=32 - -# ✗ 错误示例 -tile_bad = pl.reshape(tile_tmp, [2, 4]) # ✗ 尾轴=4, 4*4=16, 不对齐 -``` - -### 0.2 Fuzzer 中的对齐验证 - -Fuzzer 框架提供以下工具函数: - -```python -from src.fuzzer.src.fuzzer import is_shape_aligned, generate_aligned_shape, get_aligned_shapes - -# 检查形状是否对齐 -is_valid = is_shape_aligned((128, 64), dtype="FP32") # True -is_valid = is_shape_aligned((128, 5), dtype="FP32") # False - -# 生成随机的对齐形状 -shape = generate_aligned_shape(rng, dtype="FP32", max_size=256) - -# 获取所有常用的对齐形状列表 -all_shapes = get_aligned_shapes(dtype="FP32") -``` - -**Fuzzer 自动处理**: -- `KernelGenerator.generate_kernel()` 会自动验证并修正输入/输出形状 -- `OrchestratorGenerator` 会验证所有临时 tensor 的形状 -- 如果检测到不对齐的形状,会打印警告并自动生成对齐的形状 - -## 1. 
算子分类 (Operator Categories) - -### 1.1 Block Memory Operations (内存操作) - -| 算子名 | 输入类型 | 输出类型 | 参数 | 约束 | -|--------|----------|----------|------|------| -| `block.load` | `tensor` | `tile` | `offsets: [int, int]`, `shapes: [int, int]`, `target_memory: int` | target_memory ∈ {1, 2} (UB, L1) | -| `block.store` | `tile` | `tensor` | `offsets: [int, int]`, `shapes: [int, int]`, `output_tensor: tensor` | - | -| `block.l0c_store` | `tile` | `tensor` | `offsets: [int, int]`, `shapes: [int, int]`, `output_tensor: tensor` | - | -| `block.move` | `tile` | `tile` | `target_memory: int`, `transpose: bool` | target_memory ∈ {1, 2, 3, 4} | -| `block.create_tile` | - | `tile` | `shape: [int, int]`, `dtype: DataType`, `target_memory: int` | - | -| `block.full` | - | `tile` | `shape: [int, int]`, `dtype: DataType`, `value: float` | 创建填充值的tile | - -### 1.2 Block Element-wise Binary Operations (逐元素二元操作) - -| 算子名 | 输入类型 | 输出类型 | 形状约束 | NumPy等价 | -|--------|----------|----------|----------|-----------| -| `block.add` | `tile, tile` | `tile` | 支持广播 | `a + b` | -| `block.sub` | `tile, tile` | `tile` | 支持广播 | `a - b` | -| `block.mul` | `tile, tile` | `tile` | 支持广播 | `a * b` | -| `block.div` | `tile, tile` | `tile` | 支持广播,避免除零 | `a / b` | -| `block.maximum` | `tile, tile` | `tile` | 支持广播 | `np.maximum(a, b)` | -| `block.minimum` | `tile, tile` | `tile` | 支持广播 | `np.minimum(a, b)` | -| `block.cmp` | `tile, tile` | `tile` | 支持广播 | 比较操作,cmp_type: 0=EQ, 1=NE, 2=LT, 3=LE, 4=GT, 5=GE | - -### 1.3 Block Scalar Operations (标量操作) - -| 算子名 | 输入类型 | 输出类型 | NumPy等价 | -|--------|----------|----------|-----------| -| `block.adds` | `tile, scalar` | `tile` | `a + s` | -| `block.subs` | `tile, scalar` | `tile` | `a - s` | -| `block.muls` | `tile, scalar` | `tile` | `a * s` | -| `block.divs` | `tile, scalar` | `tile` | `a / s` (避免除零) | -| `block.cmps` | `tile, scalar` | `tile` | 比较操作 | - -### 1.4 Block Unary Operations (一元操作) - -| 算子名 | 输入类型 | 输出类型 | 约束 | NumPy等价 | 
-|--------|----------|----------|------|-----------| -| `block.neg` | `tile` | `tile` | - | `-a` | -| `block.exp` | `tile` | `tile` | 建议输入范围 [-10, 10] | `np.exp(a)` | -| `block.recip` | `tile` | `tile` | 避免除零 | `1.0 / a` | -| `block.sqrt` | `tile` | `tile` | 输入必须 ≥ 0 | `np.sqrt(a)` | -| `block.rsqrt` | `tile` | `tile` | 输入必须 > 0 | `1.0 / np.sqrt(a)` | -| `block.log` | `tile` | `tile` | 输入必须 > 0 | `np.log(a)` | -| `block.abs` | `tile` | `tile` | - | `np.abs(a)` | -| `block.relu` | `tile` | `tile` | - | `np.maximum(0, a)` | -| `block.cast` | `tile` | `tile` | 参数: `target_dtype: DataType`, `mode: int` | 类型转换 | - -### 1.5 Block Matrix Operations (矩阵操作) - -| 算子名 | 输入类型 | 输出类型 | 形状约束 | NumPy等价 | -|--------|----------|----------|----------|-----------| -| `block.matmul` | `tile, tile` | `tile` | `[M, K] @ [K, N] -> [M, N]` | `a @ b` | -| `block.matmul_acc` | `tile, tile, tile` | `tile` | `acc + (lhs @ rhs)` | `acc + a @ b` | - -### 1.6 Block Row/Column Broadcast Operations (行列广播操作) - -**重要**: 这些操作用于处理向量与矩阵的广播运算。 - -| 算子名 | 输入类型 | 输出类型 | 形状约束 | NumPy等价 | -|--------|----------|----------|----------|-----------| -| `block.row_expand_add` | `tile[M,N], tile[M,1]` | `tile[M,N]` | row_vec广播到每行 | `tile + row_vec` | -| `block.row_expand_sub` | `tile[M,N], tile[M,1]` | `tile[M,N]` | row_vec广播到每行 | `tile - row_vec` | -| `block.row_expand_mul` | `tile[M,N], tile[M,1]` | `tile[M,N]` | row_vec广播到每行 | `tile * row_vec` | -| `block.row_expand_div` | `tile[M,N], tile[M,1]` | `tile[M,N]` | row_vec广播到每行,避免除零 | `tile / row_vec` | -| `block.col_expand` | `tile[M,N], tile[1,N]` | `tile[M,N]` | col_vec广播到每列 | 列向量扩展 | -| `block.col_expand_mul` | `tile[M,N], tile[1,N]` | `tile[M,N]` | col_vec广播到每列 | `tile * col_vec` | -| `block.col_expand_div` | `tile[M,N], tile[1,N]` | `tile[M,N]` | col_vec广播到每列,避免除零 | `tile / col_vec` | -| `block.col_expand_sub` | `tile[M,N], tile[1,N]` | `tile[M,N]` | col_vec广播到每列 | `tile - col_vec` | -| `block.expands` | `tile[M,N], scalar` | `tile[M,N]` | 标量广播到tile形状 | 标量扩展 
| - -### 1.7 Block Reduction Operations (归约操作) - -| 算子名 | 输入类型 | 输出类型 | 参数 | 形状变换 | NumPy等价 | -|--------|----------|----------|------|----------|-----------| -| `block.sum` | `tile` | `tile` | `axis: int`, `keepdim: bool` | axis=1, keepdim=True: [M,N]->[M,1] | `np.sum(a, axis=axis, keepdims=keepdim)` | -| `block.max` | `tile` | `tile` | `axis: int`, `keepdim: bool` | 同上 | `np.max(a, axis=axis, keepdims=keepdim)` | -| `block.min` | `tile` | `tile` | `axis: int`, `keepdim: bool` | 同上 | `np.min(a, axis=axis, keepdims=keepdim)` | -| `block.row_sum` | `tile, tile` | `tile` | 需要临时tile | [M,N] -> [M,1] | `np.sum(a, axis=1, keepdims=True)` | -| `block.row_max` | `tile, tile` | `tile` | 需要临时tile | [M,N] -> [M,1] | `np.max(a, axis=1, keepdims=True)` | -| `block.row_min` | `tile, tile` | `tile` | 需要临时tile | [M,N] -> [M,1] | `np.min(a, axis=1, keepdims=True)` | - -### 1.8 Block Transform Operations (变换操作) - -| 算子名 | 输入类型 | 输出类型 | 参数 | 形状变换 | -|--------|----------|----------|------|----------| -| `block.reshape` | `tile` | `tile` | `shape: [int, int]` | 重塑形状 | -| `block.transpose` | `tile` | `tile` | `axis1: int`, `axis2: int` | 交换维度 | -| `block.view` | `tile` | `tile` | `shape: [int, int]`, `offset: [int, int]` | 创建视图 | - -### 1.9 Tensor-level Operations (Tensor级别操作) - -| 算子名 | 输入类型 | 输出类型 | 说明 | -|--------|----------|----------|------| -| `tensor.create` | - | `tensor` | 创建tensor | -| `tensor.view` | `tensor` | `tensor` | 创建tensor视图 | -| `tensor.matmul` | `tensor, tensor` | `tensor` | tensor级矩阵乘法 | -| `tensor.mul` | `tensor, tensor/scalar` | `tensor` | tensor级乘法 | -| `tensor.add` | `tensor, tensor/scalar` | `tensor` | tensor级加法 | -| `tensor.sub` | `tensor, tensor/scalar` | `tensor` | tensor级减法 | -| `tensor.div` | `tensor, tensor/scalar` | `tensor` | tensor级除法 | - -## 2. 算子组合规则 (Combination Rules) - -### 2.1 基本组合规则 - -1. **类型匹配**: 操作符的输入类型必须匹配 - - `tile` 操作符接受 `tile` 类型 - - `tensor` 操作符接受 `tensor` 类型 - - 不能混用 - -2. 
**形状兼容性**: - - 二元操作支持广播:`[M,N] op [M,N]`, `[M,N] op [M,1]`, `[M,N] op [1,N]` - - Row expand 操作: 第二个输入必须是 `[M,1]` 形状 - - Col expand 操作: 第二个输入必须是 `[1,N]` 形状 - - Matmul: `[M,K] @ [K,N] -> [M,N]` - -3. **数据约束**: - - **避免除零**: `div`, `divs`, `recip`, `row_expand_div`, `col_expand_div` - - 确保分母绝对值 ≥ 0.01 - - **正值约束**: `sqrt`, `rsqrt`, `log` - - 确保输入 > 0 或使用 `abs(x) + 1e-6` - - **范围约束**: `exp` - - 建议输入范围 [-10, 10] 避免溢出 - -### 2.2 常见算子组合模式 - -#### 模式1: Softmax 组件 -```python -# Step 1: Row max reduction -max_vals = pl.row_max(tile, tmp_tile) # [M,N] -> [M,1] - -# Step 2: Subtract max (数值稳定性) -centered = pl.row_expand_sub(tile, max_vals) # [M,N] - [M,1] -> [M,N] - -# Step 3: Exponential -exp_vals = pl.exp(centered) # [M,N] -> [M,N] - -# Step 4: Row sum -sum_vals = pl.row_sum(exp_vals, tmp_tile) # [M,N] -> [M,1] - -# Step 5: Normalize -output = pl.row_expand_div(exp_vals, sum_vals) # [M,N] / [M,1] -> [M,N] -``` - -#### 模式2: Layer Normalization 组件 -```python -# Step 1: Row mean (使用 sum + divs) -row_sum = pl.row_sum(tile, tmp_tile) # [M,N] -> [M,1] -row_mean = pl.divs(row_sum, N) # [M,1] / scalar -> [M,1] - -# Step 2: Subtract mean -centered = pl.row_expand_sub(tile, row_mean) # [M,N] - [M,1] -> [M,N] - -# Step 3: Squared -squared = pl.mul(centered, centered) # [M,N] * [M,N] -> [M,N] - -# Step 4: Variance -var_sum = pl.row_sum(squared, tmp_tile) # [M,N] -> [M,1] -variance = pl.divs(var_sum, N) # [M,1] / scalar -> [M,1] - -# Step 5: Inverse std -inv_std = pl.rsqrt(variance) # [M,1] -> [M,1] - -# Step 6: Normalize -output = pl.row_expand_mul(centered, inv_std) # [M,N] * [M,1] -> [M,N] -``` - -#### 模式3: GELU 近似 -```python -# GELU(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3))) -# 简化版本: 使用 sigmoid 近似 -# GELU(x) ≈ x * sigmoid(1.702 * x) - -# Step 1: Scale -scaled = pl.muls(tile, 1.702) # [M,N] * scalar -> [M,N] - -# Step 2: Sigmoid approximation (使用 exp) -neg_scaled = pl.neg(scaled) # -[M,N] -exp_neg = pl.exp(neg_scaled) # exp(-scaled) -one_plus_exp = pl.adds(exp_neg, 
1.0) # 1 + exp(-scaled) -sigmoid = pl.recip(one_plus_exp) # 1 / (1 + exp(-scaled)) - -# Step 3: Multiply -output = pl.mul(tile, sigmoid) # [M,N] * [M,N] -> [M,N] -``` - -#### 模式4: ReLU 及变体 -```python -# ReLU -output = pl.relu(tile) - -# LeakyReLU (alpha=0.01) -neg_part = pl.muls(tile, 0.01) # 负半部分 -output = pl.maximum(tile, neg_part) # max(x, 0.01*x) - -# ELU (alpha=1.0) - 简化版 -# ELU(x) = x if x > 0 else alpha * (exp(x) - 1) -zeros = pl.expands(tile, 0.0) -pos_mask = pl.maximum(tile, zeros) # 正半部分 -exp_x = pl.exp(tile) # exp(x) -exp_minus_1 = pl.subs(exp_x, 1.0) # exp(x) - 1 -# 需要 select 操作来完整实现 -``` - -### 2.3 禁止的算子组合 - -1. **类型混用**: - ```python - # ✗ 错误: 不能直接对 tensor 使用 block 操作 - tile_result = pl.add(tensor_a, tensor_b) - - # ✓ 正确: 先 load 到 tile - tile_a = pl.load(tensor_a, offsets=[0, 0], shapes=[M, N]) - tile_b = pl.load(tensor_b, offsets=[0, 0], shapes=[M, N]) - tile_result = pl.add(tile_a, tile_b) - ``` - -2. **形状不匹配**: - ```python - # ✗ 错误: row_expand 操作需要 [M,1] 形状 - tile_a = [128, 128] - tile_b = [128, 64] # 错误形状 - result = pl.row_expand_div(tile_a, tile_b) - - # ✓ 正确: 使用 reshape 或正确的 load 形状 - tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 1]) # [128,1] - result = pl.row_expand_div(tile_a, tile_b) - ``` - -3. **未处理的数值约束**: - ```python - # ✗ 错误: 可能除零 - result = pl.div(tile_a, tile_b) - - # ✓ 正确: 确保分母不为零 - tile_b_safe = pl.maximum(tile_b, pl.expands(tile_b, 0.01)) - result = pl.div(tile_a, tile_b_safe) - ``` - -## 3. Fuzzer 生成策略 - -### 3.1 操作符选择权重 - -基于实际硬件支持和测试价值,建议权重: - -- **高频操作** (权重 10): `add`, `mul`, `sub`, `maximum`, `adds`, `muls` -- **中频操作** (权重 5): `div`, `sqrt`, `exp`, `row_expand_*`, `matmul` -- **低频操作** (权重 2): `rsqrt`, `log`, `recip`, `transpose`, `reshape` -- **特殊操作** (权重 1): `cast`, `cmp`, reduction 操作 - -### 3.2 形状生成策略 - -支持的形状规格: -- **标准方形**: 32x32, 64x64, 96x96, 128x128, 256x256 -- **长方形**: 64x128, 128x64, 80x96, 96x80, 128x256 -- **向量形状**: Nx1, 1xN (用于 row/col expand) - -### 3.3 操作链生成规则 - -1. **长度范围**: 3-10 个操作 -2. 
**变量重用**: 每个中间结果至少使用一次 -3. **输入使用**: 所有输入必须至少被使用一次 -4. **类型一致性**: 操作链内保持 tile 类型 -5. **形状追踪**: 追踪每个变量的形状以确保兼容性 - -### 3.4 测试用例模板 - -```python -@pl.function(type=pl.FunctionType.InCore) -def kernel_func(self, a: pl.Tensor[[M, N], pl.FP32], - b: pl.Tensor[[M, 1], pl.FP32]) -> pl.Tensor[[M, N], pl.FP32]: - # Load tiles - tile_a = pl.load(a, offsets=[0, 0], shapes=[M, N]) - tile_b = pl.load(b, offsets=[0, 0], shapes=[M, 1]) - - # Operation chain (fuzzer generated) - tmp_0 = pl.row_expand_div(tile_a, tile_b) - tmp_1 = pl.sqrt(tmp_0) - tmp_2 = pl.muls(tmp_1, 2.0) - # ... more operations - - return tmp_final -``` - -## 4. 参考示例 - -完整示例见: [tests/test_cases/test_expand.py](../../tests/test_cases/test_expand.py) - -主要展示了: -- 如何使用 `row_expand_div` 操作 -- 如何处理不同形状的输入 -- 如何编写 `compute_expected` 参考实现 diff --git a/src/fuzzer/README.md b/src/fuzzer/README.md index 1f0ee7d..3f0f784 100644 --- a/src/fuzzer/README.md +++ b/src/fuzzer/README.md @@ -1,18 +1,36 @@ -# 多内核模糊测试框架 +# 多内核模糊测试框架 (Multi-Kernel Fuzzing Framework) 这是一个用于生成和测试多内核 PyPTO 程序的自动化框架。该框架可以随机生成多个 InCore 内核函数,并通过 Orchestration 函数以不同的模式组合它们。 **注意**:`src/fuzzer` 是一个独立的框架,不依赖 `src/pto_test/fuzzing`。所有必要的代码都包含在此目录中。 +--- + +## 目录 + +1. [快速开始](#快速开始) +2. [代码结构](#代码结构) +3. [核心概念](#核心概念) +4. [配置指南](#配置指南) +5. [算子规则](#算子规则) +6. [使用示例](#使用示例) +7. 
[更新日志](#更新日志) + +--- + ## 快速开始 -### 基础示例 (基本算子) +### 基础示例 + ```bash -# 生成1个测试用例 (使用基础算子: add, mul, div, sqrt, exp等) -python src/fuzzer/example_multi_kernel.py --num-cases 1 +# 生成测试用例(使用默认配置) +python src/fuzzer/example_multi_kernel.py + +# 生成特定配置的测试用例 +python src/fuzzer/example_multi_kernel.py --config-index 0 -# 生成5个测试用例 -python src/fuzzer/example_multi_kernel.py --num-cases 5 +# 设置误差容忍度 +python src/fuzzer/example_multi_kernel.py --atol 1e-3 --rtol 1e-3 # 运行测试(只生成代码) pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only @@ -21,32 +39,28 @@ pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only --save-kernels --kernels-dir=/tmp/kernels ``` -**说明**: 基础示例默认使用以下算子: -- 二元: add, sub, mul, div, maximum, minimum -- 标量: adds, subs, muls, divs -- 一元: sqrt, rsqrt, exp, neg, recip, log, abs, relu +### 命令行参数 -### 高级示例 (row_expand, matmul 等高级算子) ```bash -# 生成使用高级算子的测试用例 -python src/fuzzer/example_multi_kernel.py --num-cases 3 --enable-advanced-ops +python src/fuzzer/example_multi_kernel.py [选项] -# 运行高级算子测试 -pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only +选项: + --config-index N 指定配置索引(从0开始),不指定则使用所有配置 + --output PATH 输出文件路径(默认: src/fuzzer/generated_tests/test_fuzz_multi_kernel.py) + --atol FLOAT 绝对误差容忍度(默认: 1e-4) + --rtol FLOAT 相对误差容忍度(默认: 1e-4) ``` -**高级算子包括**: -- Row expand: row_expand_add, row_expand_sub, row_expand_mul, row_expand_div -- Matrix: matmul +--- -**注意**: 使用 row_expand 算子时,请确保输入形状正确配置(第二个输入应为 [M, 1] 形状)。 +## 代码结构 -## 目录结构 +### 目录结构 ``` src/fuzzer/ # 独立的模糊测试框架 ├── __init__.py # 外部接口 -├── example_multi_kernel.py # 使用示例脚本 +├── example_multi_kernel.py # 使用示例脚本(主入口) ├── conftest.py # pytest 配置 ├── README.md # 本文档 ├── src/ # 内部实现 @@ -59,161 +73,329 @@ src/fuzzer/ # 独立的模糊测试框架 └── test_fuzz_multi_kernel.py # 生成的测试文件 ``` -## Op 组合规则 +### 核心模块说明 -**详细规则文档**: 请参考 [OP_RULES.md](OP_RULES.md) 获取完整的算子规则和组合模式。 +#### 1. 
fuzzer.py - OpFuzzer +操作符模糊生成器,负责: +- 定义所有支持的算子(二元、一元、标量、高级算子) +- 随机生成操作链 +- 处理数据约束(避免除零、正值约束等) +- 生成 NumPy/PyTorch 参考实现 -### 1. 操作符定义 +**主要类**: +- `OpSpec`: 算子规格定义 +- `OpFuzzer`: 操作链生成器 -操作符在 [src/fuzzer.py](src/fuzzer.py) 的 `OpFuzzer.__init__` 方法中定义。 +#### 2. kernel_generator.py - KernelGenerator +内核生成器,负责: +- 生成单个 InCore 内核函数 +- 支持不同数量和维度的输入 +- 生成 PyPTO 代码和 PyTorch 参考实现 +- 处理形状对齐约束 -**当前支持的操作**: -- **二元操作**: `block.add`, `block.sub`, `block.mul`, `block.div`, `block.maximum`, `block.minimum` -- **标量操作**: `block.adds`, `block.subs`, `block.muls`, `block.divs` -- **一元操作**: `block.sqrt`, `block.rsqrt`, `block.exp`, `block.neg`, `block.recip`, `block.log`, `block.abs`, `block.relu` -- **行广播操作** (高级): `block.row_expand_add`, `block.row_expand_sub`, `block.row_expand_mul`, `block.row_expand_div` -- **矩阵操作** (高级): `block.matmul` +#### 3. orchestrator_generator.py - OrchestratorGenerator +编排函数生成器,负责: +- 生成 Orchestration 函数 +- 支持三种组合模式:sequential、branching、mixed +- 管理内核间的数据流 -**启用高级操作**: -```python -# 在生成器中启用高级操作 -from src.fuzzer.src.fuzzer import OpFuzzer +#### 4. multi_kernel_test_generator.py - MultiKernelTestGenerator +测试用例生成器,负责: +- 生成完整的 PTOTestCase 类 +- 集成内核和编排函数 +- 生成 PyTorch 参考实现 +- 生成测试文件 -# 启用行广播和矩阵操作 -fuzzer = OpFuzzer(seed=42, enable_advanced_ops=True) -``` +--- -**添加新操作**: -```python -# 在 fuzzer.py 中定义新操作 -CUSTOM_OPS = [ - OpSpec("block.custom_op", ["tile", "tile"], "tile", {}, lambda a, b: custom_numpy_impl(a, b)), -] +## 核心概念 -# 在 __init__ 中添加 -self.ops = self.ops + CUSTOM_OPS -``` - -**操作符约束**: -- `avoid_zero`: 用于除法操作,确保分母不为零 -- `positive_only`: 用于 sqrt, log 等操作,确保输入为正数 -- `row_vec_shape`: 用于 row_expand 操作,要求第二个输入形状为 [M,1] - -更多详情请查看 [OP_RULES.md](OP_RULES.md) 中的完整算子列表和约束说明。 - -### 2. 内核生成规则 +### 1. 
内核生成规则 每个 InCore 内核包含: -- **输入**: 1-3 个 tile 张量,**支持不同维度** -- **操作链**: 1-10 个随机操作 +- **输入**: 1-3 个 tile 张量,支持不同维度 +- **操作链**: 3-10 个随机操作 - **输出**: 1 个 tile 张量 -**输入张量配置**: -- 可以指定每个内核的输入数量和维度 -- 不同内核可以有不同数量的输入(1-3个) -- 每个输入可以有不同的形状(如 128x128, 64x64, 256x256) -- 如果不指定,框架会随机生成输入配置 - -**示例配置**: -```python -# 在 example_multi_kernel.py 中配置 -{ - "name": "test_case_name", - "num_kernels": 3, - "input_shapes_list": [ - [(128, 128), (64, 64)], # kernel_0: 2个不同维度的输入 - [(128, 128), (128, 128), (256, 256)], # kernel_1: 3个不同维度的输入 - [(256, 256)], # kernel_2: 1个输入 - ], -} -``` - **操作链生成规则**: 1. 从输入张量中随机选择操作数 -2. 随机选择一个操作符(add/sub/mul/div) +2. 随机选择一个操作符(根据权重) 3. 执行操作并生成中间结果 4. 中间结果可以被后续操作使用 5. 最后一个操作的结果作为内核输出 **示例**: ```python -# 生成的内核代码 - 不同维度的输入 @pl.function(type=pl.FunctionType.InCore) -def kernel_0(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[64, 64], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: +def kernel_0(self, a: pl.Tensor[[128, 128], pl.FP32], + b: pl.Tensor[[64, 64], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 128]) - tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 128]) # 加载到输出大小 - tmp_0 = pl.add(tile_b, tile_a) # 操作1: b + a - tmp_1 = pl.mul(tmp_0, tile_a) # 操作2: tmp_0 * a - tmp_2 = pl.sub(tmp_1, tile_b) # 操作3: tmp_1 - b + tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 128]) + tmp_0 = pl.add(tile_b, tile_a) # 操作1 + tmp_1 = pl.mul(tmp_0, tile_a) # 操作2 + tmp_2 = pl.sub(tmp_1, tile_b) # 操作3 return tmp_2 ``` -### 3. 内核组合模式 +### 2. 
内核组合模式 + +#### Sequential (顺序模式) +内核按顺序执行,每个内核的输出作为下一个内核的输入。 -**Sequential (顺序模式)**: -- 内核按顺序执行 -- 每个内核的输出作为下一个内核的输入 ``` input → kernel_0 → kernel_1 → kernel_2 → output ``` -**Branching (分支模式)**: -- 多个内核并行执行 -- 使用 merge 内核合并结果 +#### Branching (分支模式) +多个内核并行执行,使用 merge 内核合并结果。 + ``` input → kernel_0 ↘ input → kernel_1 → merge → output input → kernel_2 ↗ ``` -**Mixed (混合模式)**: -- 结合顺序和分支执行 +#### Mixed (混合模式) +结合顺序和分支执行。 + ``` input → kernel_0 ↘ input → kernel_1 → merge → kernel_2 → kernel_3 → output ``` -### 4. 带参数的操作符 +### 3. 支持的算子 + +#### 基本算子(默认启用) +- **二元操作**: add, sub, mul, div, maximum, minimum +- **标量操作**: adds, subs, muls, divs +- **一元操作**: sqrt, rsqrt, exp, neg, recip, log, abs, relu + +#### 高级算子(需要启用) +- **行广播操作**: row_expand_add, row_expand_sub, row_expand_mul, row_expand_div +- **矩阵操作**: matmul + +详细算子规则请参考 [算子规则](#算子规则) 章节。 + +--- + +## 配置指南 + +### 配置结构 + +所有配置都在 `example_multi_kernel.py` 的 `all_configs` 列表中定义: + +```python +all_configs = [ + { + # 基本信息 + "name": "test_name", # 测试用例名称(必需) + "description": "测试描述", # 测试描述(可选) + + # 生成控制 + "num_instances": 1, # 从该配置生成的测试实例数量 + "seed": 42, # 随机种子 + + # 算子配置 + "enable_advanced_ops": False, # 是否启用高级算子 + + # 张量配置 + "tensor_init_type": "constant", # 张量初始化类型 + "shape": (128, 128), # 张量形状 + + # 内核配置 + "num_kernels": 3, # 内核数量 + "mode": "sequential", # 组合模式 + "num_ops_range": (3, 7), # 每个内核的操作数量范围 + "input_shapes_list": None, # 每个内核的输入形状列表(可选) + }, +] +``` + +### 配置字段详解 -框架支持带参数的操作符(如 transpose, reduce, reshape): +#### 1. 基本信息 +- **name** (必需): 测试用例的名称 +- **description** (可选): 测试用例的描述 + +#### 2. 生成控制 +- **num_instances** (默认: 1): 从该配置生成的测试实例数量 + - 如果设置为 N > 1,将生成 N 个测试用例 + - 每个实例使用不同的随机种子:`seed + instance_index` + - 实例名称自动添加索引:`name_0`, `name_1`, ..., `name_N-1` + +- **seed** (默认: 42): 随机种子,用于可重现性 + +#### 3. 算子配置 +- **enable_advanced_ops** (默认: False): 是否启用高级算子 + - False: 只使用基本算子 + - True: 包含高级算子(row_expand, matmul 等) + +#### 4. 
张量配置 +- **tensor_init_type** (默认: "constant"): 张量初始化类型 + - `"constant"`: 每个张量使用不同的常量值(2.0, 2.5, 3.0, ...) + - `"random"`: 使用 `torch.randn` 生成随机正态分布值 + - `"range"`: 使用 `torch.rand` 生成 [0, 1) 范围内的随机值 + - `"normal"`: 使用 `torch.randn` 生成标准正态分布值 + - `"ones"`: 所有元素初始化为 1.0 + - `"zeros"`: 所有元素初始化为 0.0 + +- **shape** (默认: (128, 128)): 张量的形状 + +#### 5. 内核配置 +- **num_kernels** (默认: 3): 生成的内核数量 + +- **mode** (默认: "sequential"): 内核组合模式 + - `"sequential"`: 顺序执行 + - `"branching"`: 分支执行 + - `"mixed"`: 混合模式 + +- **num_ops_range** (默认: (3, 7)): 每个内核包含的操作数量范围 + +- **input_shapes_list** (可选): 每个内核的输入形状列表 + - 如果为 None,则自动生成 + - 示例:`[[(128, 128), (128, 128)], [(128, 128)]]` + +### 配置示例 + +#### 示例 1: 简单顺序执行 +```python +{ + "name": "simple_sequential", + "num_instances": 1, + "seed": 42, + "enable_advanced_ops": False, + "num_kernels": 2, + "mode": "sequential", + "shape": (128, 128), + "num_ops_range": (3, 5), + "tensor_init_type": "constant", + "input_shapes_list": [ + [(128, 128), (128, 128)], # kernel_0: 2个输入 + ], + "description": "简单顺序执行:2个内核" +} +``` +#### 示例 2: 生成多个随机测试实例 ```python -# 在 fuzzer.py 中添加 -OpSpec( - "block.transpose", - ["tile"], "tile", {}, - lambda a, dims: np.transpose(a, dims), - shape_transform=lambda shapes, params: tuple(shapes[0][i] for i in params['dims']), - param_generator=lambda shapes, rng: {'dims': (1, 0)}, - requires_params=True -) +{ + "name": "random_tests", + "num_instances": 5, # 生成5个测试用例 + "seed": 100, # 将使用种子 100, 101, 102, 103, 104 + "enable_advanced_ops": False, + "num_kernels": 3, + "mode": "branching", + "shape": (128, 128), + "num_ops_range": (4, 8), + "tensor_init_type": "random", + "input_shapes_list": None, + "description": "随机分支测试:生成5个不同的测试实例" +} ``` -**OpSpec 参数说明**: -- `name`: 操作名称(PyPTO API) -- `input_types`: 输入类型列表 -- `output_type`: 输出类型 -- `constraints`: 约束条件(如 `avoid_zero`, `positive_only`) -- `np_equivalent`: NumPy 参考实现 -- `shape_transform`: shape 变换函数(可选) -- `param_generator`: 参数生成函数(可选) -- `requires_params`: 是否需要参数 +--- -### 5. 
常见算子组合模式 +## 算子规则 -参考 [OP_RULES.md](OP_RULES.md) 第 2.2 节获取完整的算子组合模式,包括: +### 形状对齐约束 -#### Softmax 模式 +**重要**: 所有 tensor 创建和 reshape 操作必须满足 32 字节对齐约束。 + +**规则**: +- 形状的尾轴(最后一个维度,即列数)必须满足: + 1. 尾轴 = 1, 或者 + 2. (尾轴 × sizeof(datatype)) % 32 == 0 + +**FP32 类型的有效尾轴值**: +- 尾轴 = 1(总是有效) +- 尾轴 % 8 == 0(因为 8 × 4 = 32) +- 有效值: 1, 8, 16, 24, 32, 40, 48, 56, 64, ..., 128, ... + +**示例**: ```python -# 1. Row max reduction -max_vals = pl.row_max(tile, tmp_tile) -# 2. Subtract max for numerical stability +# ✓ 有效的形状 +pl.tensor.create([128, 1], pl.FP32) # 尾轴=1 +pl.tensor.create([128, 8], pl.FP32) # 8*4=32, 对齐 +pl.tensor.create([128, 128], pl.FP32) # 128*4=512, 对齐 + +# ✗ 无效的形状 +pl.tensor.create([128, 3], pl.FP32) # 3*4=12, 不对齐 +pl.tensor.create([128, 5], pl.FP32) # 5*4=20, 不对齐 +``` + +### 算子分类 + +#### 1. Block Element-wise Binary Operations +| 算子名 | 输入类型 | 输出类型 | 约束 | NumPy等价 | +|--------|----------|----------|------|-----------| +| `block.add` | `tile, tile` | `tile` | 支持广播 | `a + b` | +| `block.sub` | `tile, tile` | `tile` | 支持广播 | `a - b` | +| `block.mul` | `tile, tile` | `tile` | 支持广播 | `a * b` | +| `block.div` | `tile, tile` | `tile` | 避免除零 | `a / b` | +| `block.maximum` | `tile, tile` | `tile` | 支持广播 | `np.maximum(a, b)` | +| `block.minimum` | `tile, tile` | `tile` | 支持广播 | `np.minimum(a, b)` | + +#### 2. Block Scalar Operations +| 算子名 | 输入类型 | 输出类型 | NumPy等价 | +|--------|----------|----------|-----------| +| `block.adds` | `tile, scalar` | `tile` | `a + s` | +| `block.subs` | `tile, scalar` | `tile` | `a - s` | +| `block.muls` | `tile, scalar` | `tile` | `a * s` | +| `block.divs` | `tile, scalar` | `tile` | `a / s` | + +#### 3. 
Block Unary Operations +| 算子名 | 输入类型 | 输出类型 | 约束 | NumPy等价 | +|--------|----------|----------|------|-----------| +| `block.neg` | `tile` | `tile` | - | `-a` | +| `block.exp` | `tile` | `tile` | 建议范围 [-10, 10] | `np.exp(a)` | +| `block.recip` | `tile` | `tile` | 避免除零 | `1.0 / a` | +| `block.sqrt` | `tile` | `tile` | 输入 ≥ 0 | `np.sqrt(a)` | +| `block.rsqrt` | `tile` | `tile` | 输入 > 0 | `1.0 / np.sqrt(a)` | +| `block.log` | `tile` | `tile` | 输入 > 0 | `np.log(a)` | +| `block.abs` | `tile` | `tile` | - | `np.abs(a)` | +| `block.relu` | `tile` | `tile` | - | `np.maximum(0, a)` | + +#### 4. Block Row/Column Broadcast Operations (高级) +| 算子名 | 输入类型 | 输出类型 | 形状约束 | NumPy等价 | +|--------|----------|----------|----------|-----------| +| `block.row_expand_add` | `tile[M,N], tile[M,1]` | `tile[M,N]` | 第二个输入 [M,1] | `tile + row_vec` | +| `block.row_expand_sub` | `tile[M,N], tile[M,1]` | `tile[M,N]` | 第二个输入 [M,1] | `tile - row_vec` | +| `block.row_expand_mul` | `tile[M,N], tile[M,1]` | `tile[M,N]` | 第二个输入 [M,1] | `tile * row_vec` | +| `block.row_expand_div` | `tile[M,N], tile[M,1]` | `tile[M,N]` | 第二个输入 [M,1],避免除零 | `tile / row_vec` | + +#### 5. Block Matrix Operations (高级) +| 算子名 | 输入类型 | 输出类型 | 形状约束 | NumPy等价 | +|--------|----------|----------|----------|-----------| +| `block.matmul` | `tile, tile` | `tile` | `[M, K] @ [K, N] -> [M, N]` | `a @ b` | + +### 数据约束 + +1. **避免除零**: `div`, `divs`, `recip`, `row_expand_div` + - 确保分母绝对值 ≥ 0.01 + +2. **正值约束**: `sqrt`, `rsqrt`, `log` + - 确保输入 > 0 或使用 `abs(x) + 1e-6` + +3. **范围约束**: `exp` + - 建议输入范围 [-10, 10] 避免溢出 + +### 常见算子组合模式 + +#### Softmax 组件 +```python +# Step 1: Row max reduction +max_vals = pl.row_max(tile, tmp_tile) # [M,N] -> [M,1] + +# Step 2: Subtract max (数值稳定性) centered = pl.row_expand_sub(tile, max_vals) -# 3. Exponential + +# Step 3: Exponential exp_vals = pl.exp(centered) -# 4. Row sum + +# Step 4: Row sum sum_vals = pl.row_sum(exp_vals, tmp_tile) -# 5. 
Normalize + +# Step 5: Normalize output = pl.row_expand_div(exp_vals, sum_vals) ``` @@ -227,48 +409,64 @@ neg_part = pl.muls(tile, 0.01) output = pl.maximum(tile, neg_part) ``` -更多模式请参考 [OP_RULES.md](OP_RULES.md)。 +--- -## 命令行参数 +## 使用示例 ### 生成测试用例 ```bash -python src/fuzzer/example_multi_kernel.py [选项] +# 使用默认配置生成所有测试用例 +python src/fuzzer/example_multi_kernel.py -选项: - --num-cases N 生成的测试用例数量 (1-5,默认: 1) - --output PATH 输出文件路径 - --seed N 随机种子 +# 只生成第一个配置 +python src/fuzzer/example_multi_kernel.py --config-index 0 + +# 设置误差容忍度 +python src/fuzzer/example_multi_kernel.py --atol 1e-3 --rtol 1e-3 + +# 指定输出文件 +python src/fuzzer/example_multi_kernel.py --output my_tests.py + +# 组合使用 +python src/fuzzer/example_multi_kernel.py --config-index 1 --atol 1e-4 --rtol 1e-4 --output my_test.py ``` ### 运行测试 ```bash -pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py [选项] - -常用选项: - -v 显示详细输出 - -s 显示 print 输出 - --codegen-only 只生成代码,不执行 - --platform=PLATFORM 指定平台(如 a2a3sim) - --device=N 指定设备编号 - --save-kernels 保存生成的 C++ 代码 - --kernels-dir=DIR 指定保存目录 - --dump-passes 打印编译器优化 pass +# 运行所有测试 +pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v + +# 只生成代码,不执行 +pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only + +# 保存生成的 C++ 代码 +pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py -v --codegen-only --save-kernels --kernels-dir=/tmp/kernels + +# 运行特定测试 +pytest src/fuzzer/generated_tests/test_fuzz_multi_kernel.py::TestMultiKernelFuzzing::test_fuzz_sequential_simple -v ``` -## 生成的代码结构 +### 生成的代码结构 ```python class TestFuzzSequentialSimple(PTOTestCase): + rows = 128 + cols = 128 + + def __init__(self): + super().__init__() + self.config.atol = 1e-4 + self.config.rtol = 1e-4 + def get_name(self): return "fuzz_sequential_simple" def define_tensors(self): return [ - TensorSpec('a', [128, 128], DataType.FP32, is_input=True), - TensorSpec('b', [128, 128], DataType.FP32, is_input=True), + TensorSpec('a', [128, 128], DataType.FP32, 
init_value=2.0), + TensorSpec('b', [128, 128], DataType.FP32, init_value=2.5), TensorSpec('output', [128, 128], DataType.FP32, is_output=True), ] @@ -288,40 +486,45 @@ class TestFuzzSequentialSimple(PTOTestCase): return Program def compute_expected(self, tensors, params=None): - # NumPy 参考实现 + # PyTorch 参考实现 pass ``` -## 扩展框架 +--- -### 添加新操作符 +## 更新日志 -编辑 [fuzzer.py](fuzzer.py) 的 `OpFuzzer.__init__` 方法: +### 最新更新 -```python -# 在 OpFuzzer.__init__ 中 -self.ops = self.BLOCK_BINARY_OPS + self.BLOCK_SCALAR_OPS + self.BLOCK_UNARY_OPS +#### 新增功能 +- 支持多种张量初始化类型:constant, random, range, normal, ones, zeros +- 支持从单个配置生成多个测试实例(通过 `num_instances` 字段) +- 新增 `--config-index` 命令行参数,可以指定只生成某个配置的测试用例 +- 新增 `--atol` 和 `--rtol` 命令行参数,支持设置误差容忍度 +- 将所有配置参数移至 `all_configs` 结构,统一管理 -# 或者自定义操作集合 -custom_ops = [ - OpSpec("block.add", ["tile", "tile"], "tile", {}, lambda a, b: a + b), - OpSpec("block.maximum", ["tile", "tile"], "tile", {}, lambda a, b: np.maximum(a, b)), - OpSpec("block.sqrt", ["tile"], "tile", {"positive_only": True}, lambda a: np.sqrt(a)), -] -self.ops = custom_ops -``` +#### 重要变更 +- 将所有 golden 数据生成从 NumPy 替换为 PyTorch + - `_generate_numpy_reference` 重命名为 `_generate_torch_reference` + - `_get_numpy_operation` 重命名为 `_get_torch_operation` + - 所有中间计算使用 PyTorch 张量操作 -### 添加新组合模式 +- 简化命令行参数 + - 移除 `--num-cases`、`--seed`、`--enable-advanced-ops`、`--tensor-init` 参数 + - 所有配置现在通过 `all_configs` 结构管理 + - 保留 `--output`、`--config-index`、`--atol`、`--rtol` 参数 + +#### 修复 +- 修复生成的 golden.py 文件中缺少 torch 导入的问题 -在 [orchestrator_generator.py](orchestrator_generator.py) 中添加新的生成方法。 +--- ## 注意事项 1. **32字节对齐约束**: 所有 tensor 创建和 reshape 操作的形状必须满足32字节对齐 - - 形状尾轴(列数)必须是 1,或 `(cols * sizeof(dtype)) % 32 == 0` + - 形状尾轴(列数)必须是 1,或 `(cols * sizeof(dtype)) % 32 == 0` - FP32 类型有效的列数: 1, 8, 16, 24, 32, 40, 48, 56, 64, ..., 128, ... - Fuzzer 会自动验证并修正不对齐的形状 - - 详见 [OP_RULES.md](OP_RULES.md) 第 0 节 2. **张量形状**: 支持不同维度的输入张量,可以在配置中指定每个内核的输入形状 @@ -333,8 +536,31 @@ self.ops = custom_ops 6. 
**输入数量**: 每个内核支持 1-3 个输入张量,可以在配置中指定 +--- + +## 扩展框架 + +### 添加新操作符 + +编辑 `fuzzer.py` 的 `OpFuzzer.__init__` 方法: + +```python +# 在 OpFuzzer.__init__ 中 +custom_ops = [ + OpSpec("block.custom_op", ["tile", "tile"], "tile", {}, + lambda a, b: custom_numpy_impl(a, b)), +] +self.ops = self.ops + custom_ops +``` + +### 添加新组合模式 + +在 `orchestrator_generator.py` 中添加新的生成方法。 + +--- + ## 参考文件 - [tests/test_cases/test_matmul.py](../../tests/test_cases/test_matmul.py): PTOTestCase 使用模式 - [src/fuzzer/src/fuzzer.py](src/fuzzer.py): OpFuzzer 操作生成逻辑和操作符定义 -- [example_multi_kernel.py](example_multi_kernel.py): 配置示例,包括如何指定不同维度的输入 +- [example_multi_kernel.py](example_multi_kernel.py): 配置示例 diff --git a/src/fuzzer/example_multi_kernel.py b/src/fuzzer/example_multi_kernel.py index 6bf90b1..20b0868 100644 --- a/src/fuzzer/example_multi_kernel.py +++ b/src/fuzzer/example_multi_kernel.py @@ -27,24 +27,28 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例: - # 生成默认的5个测试用例 + # 使用默认配置生成测试用例 python example_multi_kernel.py - # 生成3个测试用例 - python example_multi_kernel.py --num-cases 3 + # 指定配置索引(从0开始) + python example_multi_kernel.py --config-index 0 # 指定输出文件 python example_multi_kernel.py --output custom_test.py + + # 设置误差容忍度 + python example_multi_kernel.py --atol 1e-3 --rtol 1e-3 + + # 组合使用 + python example_multi_kernel.py --config-index 1 --atol 1e-4 --rtol 1e-4 --output my_test.py """ ) parser.add_argument( - "--num-cases", + "--config-index", type=int, - default=1, - choices=range(1, 6), - metavar="N", - help="生成的测试用例数量 (1-5,默认: 5)" + default=0, + help="指定要使用的配置索引(从0开始,默认: 0)" ) parser.add_argument( @@ -55,16 +59,17 @@ def main(): ) parser.add_argument( - "--seed", - type=int, - default=4, - help="随机种子,用于可重现性 (默认: 42)" + "--atol", + type=float, + default=1e-4, + help="绝对误差容忍度 (默认: 1e-4)" ) parser.add_argument( - "--enable-advanced-ops", - action="store_true", - help="启用高级算子 (row_expand, matmul等)" + "--rtol", + type=float, + default=1e-4, + help="相对误差容忍度 
(默认: 1e-4)" ) args = parser.parse_args() @@ -75,98 +80,223 @@ def main(): else: output_path = str(_SCRIPT_DIR / "generated_tests" / "test_fuzz_multi_kernel.py") - print(f"多内核模糊测试生成器") - print(f"=" * 60) - print(f"测试用例数量: {args.num_cases}") - print(f"随机种子: {args.seed}") - print(f"启用高级算子: {'是' if args.enable_advanced_ops else '否'}") - print(f"输出文件: {output_path}") - print(f"=" * 60) - print() - - # 定义5种不同配置的测试用例 + # 定义不同配置的测试用例 + # 每个配置可以生成多个测试实例(通过 num_instances 控制) all_configs = [ { "name": "fuzz_sequential_simple", + "num_instances": 1, # 从这个配置生成1个测试用例 + "seed": 4, + "enable_advanced_ops": False, "num_kernels": 2, "mode": "sequential", "shape": (128, 128), "num_ops_range": (3, 5), + "tensor_init_type": "constant", "input_shapes_list": [ [(128, 128), (128, 128)], # kernel_0: 2个相同维度的输入 - [(128, 128), (128, 128), (128, 128)], # kernel_1: 3个相同维度的输入 ], "description": "简单顺序执行:2个内核,相同维度输入" }, - { - "name": "fuzz_branching_parallel", - "num_kernels": 3, - "mode": "branching", - "shape": (128, 128), - "num_ops_range": (4, 6), - "input_shapes_list": [ - [(128, 128), (128, 128)], # kernel_0: 2个相同维度 - [(128, 128), (128, 128)], # kernel_1: 2个相同维度 - [(128, 128)], # kernel_2: 1个输入 - ], - "description": "分支并行执行:3个内核,相同维度输入" - }, - { - "name": "fuzz_mixed_complex", - "num_kernels": 4, - "mode": "mixed", - "shape": (128, 128), - "num_ops_range": (5, 8), - "input_shapes_list": None, # 使用随机生成 - "description": "混合模式:前2个并行,后2个顺序,随机输入" - }, - { - "name": "fuzz_sequential_deep", - "num_kernels": 5, - "mode": "sequential", - "shape": (128, 128), - "num_ops_range": (6, 10), - "input_shapes_list": None, # 使用随机生成 - "description": "深度顺序执行:5个内核链式调用,随机输入" - }, - { - "name": "fuzz_branching_wide", - "num_kernels": 4, - "mode": "branching", - "shape": (128, 128), - "num_ops_range": (4, 7), - "input_shapes_list": [ - [(128, 128), (128, 128), (128, 128)], # kernel_0: 3个相同维度 - [(128, 128)], # kernel_1: 1个输入 - [(128, 128), (128, 128)], # kernel_2: 2个相同维度 - [(128, 128), (128, 128)], # kernel_3: 2个相同维度 
- ], - "description": "宽分支执行:4个内核,统一维度输入" - }, + # { + # "name": "fuzz_branching_parallel", + # "num_instances": 1, # 从这个配置生成2个测试用例 + # "seed": 42, + # "enable_advanced_ops": False, + # "num_kernels": 3, + # "mode": "branching", + # "shape": (128, 128), + # "num_ops_range": (4, 6), + # "tensor_init_type": "random", + # "input_shapes_list": [ + # [(128, 128), (128, 128)], # kernel_0: 2个相同维度 + # [(128, 128), (128, 128)], # kernel_1: 2个相同维度 + # [(128, 128)], # kernel_2: 1个输入 + # ], + # "description": "分支并行执行:3个内核,相同维度输入" + # }, + # { + # "name": "fuzz_mixed_complex", + # "num_instances": 1, + # "seed": 100, + # "enable_advanced_ops": False, + # "num_kernels": 4, + # "mode": "mixed", + # "shape": (128, 128), + # "num_ops_range": (5, 8), + # "tensor_init_type": "range", + # "input_shapes_list": None, # 使用随机生成 + # "description": "混合模式:前2个并行,后2个顺序,随机输入" + # }, + # { + # "name": "fuzz_sequential_deep", + # "num_instances": 1, + # "seed": 200, + # "enable_advanced_ops": False, + # "num_kernels": 5, + # "mode": "sequential", + # "shape": (128, 128), + # "num_ops_range": (6, 10), + # "tensor_init_type": "normal", + # "input_shapes_list": None, # 使用随机生成 + # "description": "深度顺序执行:5个内核链式调用,随机输入" + # }, + # { + # "name": "fuzz_branching_wide", + # "num_instances": 1, + # "seed": 300, + # "enable_advanced_ops": False, + # "num_kernels": 4, + # "mode": "branching", + # "shape": (128, 128), + # "num_ops_range": (4, 7), + # "tensor_init_type": "ones", + # "input_shapes_list": [ + # [(128, 128), (128, 128), (128, 128)], # kernel_0: 3个相同维度 + # [(128, 128)], # kernel_1: 1个输入 + # [(128, 128), (128, 128)], # kernel_2: 2个相同维度 + # [(128, 128), (128, 128)], # kernel_3: 2个相同维度 + # ], + # "description": "宽分支执行:4个内核,统一维度输入" + # }, ] - # 根据 num_cases 选择配置 - selected_configs = all_configs[:args.num_cases] + # 根据 config_index 选择配置 + if args.config_index is not None: + if args.config_index < 0 or args.config_index >= len(all_configs): + print(f"错误: 配置索引 {args.config_index} 超出范围 
(0-{len(all_configs)-1})") + return + selected_configs = [all_configs[args.config_index]] + else: + selected_configs = all_configs + + # 计算总测试用例数 + total_test_cases = sum(config.get("num_instances", 1) for config in selected_configs) + + print(f"多内核模糊测试生成器") + print(f"=" * 60) + print(f"配置数量: {len(selected_configs)}") + print(f"总测试用例数: {total_test_cases}") + print(f"输出文件: {output_path}") + print(f"绝对误差容忍度 (atol): {args.atol}") + print(f"相对误差容忍度 (rtol): {args.rtol}") + print(f"=" * 60) + print() print("将生成以下测试用例:") print() - for i, config in enumerate(selected_configs, 1): - print(f"{i}. {config['name']}") + test_case_num = 1 + for config_idx, config in enumerate(selected_configs): + num_instances = config.get("num_instances", 1) + print(f"配置 {config_idx}: {config['name']}") print(f" {config['description']}") + print(f" 实例数量: {num_instances}") + print(f" 随机种子: {config.get('seed', 42)}") + print(f" 启用高级算子: {'是' if config.get('enable_advanced_ops', False) else '否'}") + print(f" 张量初始化: {config.get('tensor_init_type', 'constant')}") + if num_instances > 1: + print(f" 将生成测试用例: {test_case_num} - {test_case_num + num_instances - 1}") + else: + print(f" 将生成测试用例: {test_case_num}") + test_case_num += num_instances print() - # 创建生成器 - generator = MultiKernelTestGenerator(seed=args.seed, enable_advanced_ops=args.enable_advanced_ops) + # 展开配置,为每个实例创建一个测试用例 + expanded_test_configs = [] + for config in selected_configs: + num_instances = config.get("num_instances", 1) + base_seed = config.get("seed", 42) + + for instance_idx in range(num_instances): + # 为每个实例创建一个配置副本 + test_config = config.copy() + + # 如果有多个实例,在名称后添加索引 + if num_instances > 1: + test_config["name"] = f"{config['name']}_{instance_idx}" + # 每个实例使用不同的种子 + test_config["seed"] = base_seed + instance_idx + + expanded_test_configs.append(test_config) # 生成测试文件 print("正在生成测试文件...") - generator.generate_test_file( - output_path=output_path, - test_configs=selected_configs, - ) + + # 为每个配置创建独立的生成器(使用各自的种子和配置) + all_test_cases 
= [] + for test_config in expanded_test_configs: + generator = MultiKernelTestGenerator( + seed=test_config.get("seed", 42), + enable_advanced_ops=test_config.get("enable_advanced_ops", False), + tensor_init_type=test_config.get("tensor_init_type", "constant") + ) + + test_code = generator.generate_test_case( + test_name=test_config["name"], + num_kernels=test_config.get("num_kernels", 3), + orchestration_mode=test_config.get("mode", "sequential"), + shape=test_config.get("shape", (128, 128)), + num_ops_range=test_config.get("num_ops_range", (3, 7)), + input_shapes_list=test_config.get("input_shapes_list"), + tensor_init_type=test_config.get("tensor_init_type"), + atol=args.atol, + rtol=args.rtol, + ) + all_test_cases.append(test_code) + + # 生成文件头部 + file_header = '''""" +自动生成的多内核模糊测试用例 + +该文件由 MultiKernelTestGenerator 自动生成。 +包含多个测试用例,每个测试用例包含多个 InCore 内核和一个 Orchestration 函数。 +""" + +import sys +from pathlib import Path +from typing import Any, List + +import torch +import pytest + +from pto_test.core.test_case import DataType, PTOTestCase, TensorSpec + +# 添加 pypto 到路径 +_FRAMEWORK_ROOT = Path(__file__).parent.parent.parent.parent +_PYPTO_ROOT = _FRAMEWORK_ROOT / "3rdparty" / "pypto" / "python" +if _PYPTO_ROOT.exists() and str(_PYPTO_ROOT) not in sys.path: + sys.path.insert(0, str(_PYPTO_ROOT)) + + +''' + + # 生成测试套件类 + test_suite = f''' +class TestMultiKernelFuzzing: + """多内核模糊测试套件""" + +''' + + # 为每个测试用例添加测试方法 + for test_config in expanded_test_configs: + test_name = test_config["name"] + test_suite += f''' def test_{test_name}(self, test_runner): + """测试 {test_name}""" + test_case = Test{test_name.title().replace("_", "")}() + result = test_runner.run(test_case) + assert result.passed, f"测试失败: {{result.error}}" + +''' + + # 写入文件 + with open(output_path, 'w', encoding='utf-8') as f: + f.write(file_header) + f.write('\n\n'.join(all_test_cases)) + f.write('\n\n') + f.write(test_suite) print() - print(f"✓ 成功生成 {args.num_cases} 个测试用例") + print(f"✓ 成功生成 
{len(expanded_test_configs)} 个测试用例") print(f"✓ 输出文件: {output_path}") print() print("运行测试:") diff --git a/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py b/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py index 0046f07..b2fb972 100644 --- a/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py +++ b/src/fuzzer/generated_tests/test_fuzz_multi_kernel.py @@ -9,7 +9,7 @@ from pathlib import Path from typing import Any, List -import numpy as np +import torch import pytest from pto_test.core.test_case import DataType, PTOTestCase, TensorSpec @@ -28,10 +28,13 @@ class TestFuzzSequentialSimple(PTOTestCase): 内核数量: 2 """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.rows = 128 - self.cols = 128 + rows = 128 + cols = 128 + + def __init__(self): + super().__init__() + self.config.atol = 0.0001 + self.config.rtol = 0.0001 def get_name(self) -> str: return 'fuzz_sequential_simple' @@ -40,7 +43,6 @@ def define_tensors(self) -> List[TensorSpec]: return [ TensorSpec('a', [128, 128], DataType.FP32, init_value=2.0), TensorSpec('b', [128, 128], DataType.FP32, init_value=2.5), - TensorSpec('c', [128, 128], DataType.FP32, init_value=3.0), TensorSpec('output', [128, 128], DataType.FP32, is_output=True), ] @@ -61,12 +63,11 @@ def kernel_0(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], p return result @pl.function(type=pl.FunctionType.InCore) - def kernel_1(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], c: pl.Tensor[[128, 128], pl.FP32], output: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + def kernel_1(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], output: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: tile_a = pl.load(a, offsets=[0, 0], shapes=[128, 128]) tile_b = pl.load(b, offsets=[0, 0], shapes=[128, 128]) - tile_c = pl.load(c, offsets=[0, 0], shapes=[128, 128]) - tmp_0 = pl.add(tile_a, tile_c) - tmp_1 = pl.muls(tile_b, 0.5) + tmp_0 = 
pl.div(tile_a, tile_b) + tmp_1 = pl.muls(tmp_0, 0.5) tmp_2 = pl.rsqrt(tmp_1) tmp_3 = pl.exp(tmp_0) tmp_4 = pl.add(tmp_2, tmp_3) @@ -74,21 +75,24 @@ def kernel_1(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], p return result @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32], c: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: + def orchestrator(self, a: pl.Tensor[[128, 128], pl.FP32], b: pl.Tensor[[128, 128], pl.FP32]) -> pl.Tensor[[128, 128], pl.FP32]: result_0 = self.kernel_0(a, b) - result_1 = self.kernel_1(result_0, b, c) + result_1 = self.kernel_1(result_0, b) return result_1 return FuzzSequentialSimpleProgram def compute_expected(self, tensors, params=None): - """使用 NumPy 计算期望输出""" - def _numpy_kernel_0(a, b): - """NumPy 实现: kernel_0""" + """使用 Torch 计算期望输出""" + # 将 numpy 数组转换为 torch 张量(仅在输入边界) + torch_tensors = {name: torch.from_numpy(arr) for name, arr in tensors.items() if not name.endswith('output')} + + def _torch_kernel_0(a, b): + """Torch 实现: kernel_0""" # 创建变量环境 env = {} - env['tile_a'] = a.copy() - env['tile_b'] = b.copy() + env['tile_a'] = a.clone() + env['tile_b'] = b.clone() # 执行操作链 env['tmp_0'] = env['tile_b'] - 1.0 @@ -97,28 +101,31 @@ def _numpy_kernel_0(a, b): env['tmp_3'] = env['tmp_0'] + env['tmp_2'] return env['tmp_3'] - def _numpy_kernel_1(a, b, c): - """NumPy 实现: kernel_1""" + def _torch_kernel_1(a, b): + """Torch 实现: kernel_1""" # 创建变量环境 env = {} - env['tile_a'] = a.copy() - env['tile_b'] = b.copy() - env['tile_c'] = c.copy() + env['tile_a'] = a.clone() + env['tile_b'] = b.clone() # 执行操作链 - env['tmp_0'] = env['tile_a'] + env['tile_c'] - env['tmp_1'] = env['tile_b'] * 0.5 - env['tmp_1'] = np.abs(env['tmp_1']) + 1e-6 - env['tmp_2'] = 1.0 / np.sqrt(env['tmp_1']) - env['tmp_3'] = np.exp(np.clip(env['tmp_0'], -10, 10)) + env['tile_a'] = torch.where(torch.abs(env['tile_a']) < 0.01, torch.tensor(1.0), env['tile_a']) + 
env['tile_b'] = torch.where(torch.abs(env['tile_b']) < 0.01, torch.tensor(1.0), env['tile_b']) + env['tmp_0'] = env['tile_a'] / env['tile_b'] + env['tmp_1'] = env['tmp_0'] * 0.5 + env['tmp_1'] = torch.abs(env['tmp_1']) + 1e-6 + env['tmp_2'] = torch.rsqrt(env['tmp_1']) + env['tmp_3'] = torch.exp(torch.clamp(env['tmp_0'], -10, 10)) env['tmp_4'] = env['tmp_2'] + env['tmp_3'] return env['tmp_4'] # 顺序执行模式 - result_0 = _numpy_kernel_0(tensors['a'], tensors['b']) - result_1 = _numpy_kernel_1(result_0, tensors['b'], tensors['c']) - tensors['output'][:] = result_1 + result_0 = _torch_kernel_0(torch_tensors['a'], torch_tensors['b']) + result_1 = _torch_kernel_1(result_0, torch_tensors['b']) + # 将结果转换回 numpy 并写入输出 + tensors['output'][:] = result_1.numpy() + class TestMultiKernelFuzzing: diff --git a/src/fuzzer/src/kernel_generator.py b/src/fuzzer/src/kernel_generator.py index 160bbd0..c58c9fb 100644 --- a/src/fuzzer/src/kernel_generator.py +++ b/src/fuzzer/src/kernel_generator.py @@ -139,9 +139,9 @@ def _generate_kernel_code( f" def {kernel_name}(self, {', '.join(params)}) -> pl.Tensor[[{rows}, {cols}], pl.FP32]:", ] - # 加载输入张量 - 使用输出形状作为加载大小 + # 加载输入张量 - 使用每个输入自己的实际形状 for name, (r, c) in inputs: - code_lines.append(f" tile_{name} = pl.load({name}, offsets=[0, 0], shapes=[{rows}, {cols}])") + code_lines.append(f" tile_{name} = pl.load({name}, offsets=[0, 0], shapes=[{r}, {c}])") # 生成操作链 for op_dict in op_chain: diff --git a/src/fuzzer/src/multi_kernel_test_generator.py b/src/fuzzer/src/multi_kernel_test_generator.py index bb43e51..8afe447 100644 --- a/src/fuzzer/src/multi_kernel_test_generator.py +++ b/src/fuzzer/src/multi_kernel_test_generator.py @@ -21,19 +21,67 @@ class MultiKernelTestGenerator: """生成多内核测试用例的生成器""" - def __init__(self, seed: Optional[int] = None, enable_advanced_ops: bool = False): + def __init__( + self, + seed: Optional[int] = None, + enable_advanced_ops: bool = False, + tensor_init_type: str = "constant" + ): """初始化测试生成器 Args: seed: 随机种子,用于可重现性 
enable_advanced_ops: 启用高级算子(row_expand, matmul等) + tensor_init_type: 张量初始化类型,可选值: + - "constant": 常量初始化(默认) + - "random": 随机初始化 + - "range": 范围初始化(0到1之间) + - "normal": 正态分布初始化 """ self.seed = seed self.enable_advanced_ops = enable_advanced_ops + self.tensor_init_type = tensor_init_type self.kernel_gen = KernelGenerator(seed=seed, enable_advanced_ops=enable_advanced_ops) self.orch_gen = OrchestratorGenerator(seed=seed) self.fuzzer = OpFuzzer(seed=seed, enable_advanced_ops=enable_advanced_ops) + def _generate_tensor_init_value(self, tensor_index: int, init_type: str = None) -> str: + """生成张量初始化值的代码 + + Args: + tensor_index: 张量索引(用于生成不同的常量值) + init_type: 初始化类型,如果为None则使用self.tensor_init_type + + Returns: + 初始化值的代码字符串 + """ + if init_type is None: + init_type = self.tensor_init_type + + if init_type == "constant": + # 常量初始化:每个张量使用不同的常量 + init_val = 2.0 + tensor_index * 0.5 + return f"init_value={init_val}" + elif init_type == "random": + # 随机初始化:使用lambda函数生成随机数 + return "init_value=lambda shape: torch.randn(shape, dtype=torch.float32).numpy()" + elif init_type == "range": + # 范围初始化:0到1之间的均匀分布 + return "init_value=lambda shape: torch.rand(shape, dtype=torch.float32).numpy()" + elif init_type == "normal": + # 正态分布初始化:均值0,标准差1 + return "init_value=lambda shape: torch.randn(shape, dtype=torch.float32).numpy()" + elif init_type == "ones": + # 全1初始化 + return "init_value=1.0" + elif init_type == "zeros": + # 全0初始化(不推荐用于输入,可能导致除零) + return "init_value=0.0" + else: + # 默认使用常量 + init_val = 2.0 + tensor_index * 0.5 + return f"init_value={init_val}" + def _compute_output_shapes_for_sequential( self, num_kernels: int, @@ -201,6 +249,9 @@ def generate_test_case( shape: Tuple[int, int] = (128, 128), num_ops_range: Tuple[int, int] = (3, 7), input_shapes_list: Optional[List[List[Tuple[int, int]]]] = None, + tensor_init_type: Optional[str] = None, + atol: float = 1e-5, + rtol: float = 1e-5, ) -> str: """生成完整的测试用例代码 @@ -211,6 +262,9 @@ def generate_test_case( shape: 张量形状 num_ops_range: 
每个内核的操作数量范围 input_shapes_list: 每个内核的输入形状列表(可选) + tensor_init_type: 张量初始化类型(可选,如果不指定则使用全局配置) + atol: 绝对误差容忍度 + rtol: 相对误差容忍度 Returns: 完整的测试用例代码字符串 @@ -243,51 +297,54 @@ def generate_test_case( else: raise ValueError(f"未知的组合模式: {orchestration_mode}") - # 生成 NumPy 参考实现 - numpy_code = self._generate_numpy_reference(kernels, orch_info) + # 生成 Torch 参考实现 + torch_code = self._generate_torch_reference(kernels, orch_info) # 生成完整的测试类 test_code = self._generate_test_class( test_name=test_name, kernels=kernels, orch_info=orch_info, - numpy_code=numpy_code, + torch_code=torch_code, shape=shape, + tensor_init_type=tensor_init_type, + atol=atol, + rtol=rtol, ) return test_code - def _generate_numpy_reference( + def _generate_torch_reference( self, kernels: List[Dict[str, Any]], orch_info: Dict[str, Any], ) -> str: - """生成 NumPy 参考实现代码 + """生成 Torch 参考实现代码 Args: kernels: 内核信息列表 orch_info: Orchestration 信息 Returns: - NumPy 参考实现代码字符串 + Torch 参考实现代码字符串 """ code_lines = [] - # 为每个内核生成 NumPy 函数 + # 为每个内核生成 Torch 函数 for kernel in kernels: kernel_name = kernel["name"] input_names = [inp[0] for inp in kernel["inputs"]] op_chain = kernel["op_chain"] # 嵌套函数不需要 self 参数 - code_lines.append(f" def _numpy_{kernel_name}({', '.join(input_names)}):") - code_lines.append(f" \"\"\"NumPy 实现: {kernel_name}\"\"\"") + code_lines.append(f" def _torch_{kernel_name}({', '.join(input_names)}):") + code_lines.append(f" \"\"\"Torch 实现: {kernel_name}\"\"\"") - # 生成 NumPy 操作 + # 生成 Torch 操作 code_lines.append(f" # 创建变量环境") code_lines.append(f" env = {{}}") for name in input_names: - code_lines.append(f" env['tile_{name}'] = {name}.copy()") + code_lines.append(f" env['tile_{name}'] = {name}.clone()") code_lines.append(f"") code_lines.append(f" # 执行操作链") @@ -308,32 +365,32 @@ def _generate_numpy_reference( if "avoid_zero" in op.constraints and op.constraints["avoid_zero"]: for i, inp in enumerate(inputs): if inp.startswith("tile_") or inp.startswith("tmp_"): - code_lines.append(f" env['{inp}'] = 
np.where(np.abs(env['{inp}']) < 0.01, 1.0, env['{inp}'])") + code_lines.append(f" env['{inp}'] = torch.where(torch.abs(env['{inp}']) < 0.01, torch.tensor(1.0), env['{inp}'])") if "positive_only" in op.constraints and op.constraints["positive_only"]: for i, inp in enumerate(inputs): if inp.startswith("tile_") or inp.startswith("tmp_"): - code_lines.append(f" env['{inp}'] = np.abs(env['{inp}']) + 1e-6") + code_lines.append(f" env['{inp}'] = torch.abs(env['{inp}']) + 1e-6") # 生成操作 if op.np_equivalent: - np_expr = self._get_numpy_operation(op.name, input_vals) - code_lines.append(f" env['{output}'] = {np_expr}") + torch_expr = self._get_torch_operation(op.name, input_vals) + code_lines.append(f" env['{output}'] = {torch_expr}") code_lines.append(f" return env['{op_chain[-1]['output']}']") code_lines.append(f"") return "\n".join(code_lines) - def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: - """将 PyPTO 操作名转换为 NumPy 操作表达式 + def _get_torch_operation(self, op_name: str, input_vals: List[str]) -> str: + """将 PyPTO 操作名转换为 Torch 操作表达式 Args: op_name: PyPTO 操作名 (如 "block.add") input_vals: 输入值列表 Returns: - NumPy 操作表达式字符串 + Torch 操作表达式字符串 """ # 根据操作类型生成表达式 # 二元操作 @@ -346,9 +403,9 @@ def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: elif op_name == "block.div": return f"{input_vals[0]} / {input_vals[1]}" elif op_name == "block.maximum": - return f"np.maximum({input_vals[0]}, {input_vals[1]})" + return f"torch.maximum({input_vals[0]}, {input_vals[1]})" elif op_name == "block.minimum": - return f"np.minimum({input_vals[0]}, {input_vals[1]})" + return f"torch.minimum({input_vals[0]}, {input_vals[1]})" # 标量操作 elif op_name == "block.adds": return f"{input_vals[0]} + {input_vals[1]}" @@ -360,21 +417,21 @@ def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: return f"{input_vals[0]} / {input_vals[1]}" # 一元操作 elif op_name == "block.sqrt": - return f"np.sqrt({input_vals[0]})" + return 
f"torch.sqrt({input_vals[0]})" elif op_name == "block.rsqrt": - return f"1.0 / np.sqrt({input_vals[0]})" + return f"torch.rsqrt({input_vals[0]})" elif op_name == "block.exp": - return f"np.exp(np.clip({input_vals[0]}, -10, 10))" + return f"torch.exp(torch.clamp({input_vals[0]}, -10, 10))" elif op_name == "block.neg": return f"-{input_vals[0]}" elif op_name == "block.recip": - return f"1.0 / {input_vals[0]}" + return f"torch.reciprocal({input_vals[0]})" elif op_name == "block.log": - return f"np.log({input_vals[0]})" + return f"torch.log({input_vals[0]})" elif op_name == "block.abs": - return f"np.abs({input_vals[0]})" + return f"torch.abs({input_vals[0]})" elif op_name == "block.relu": - return f"np.maximum(0, {input_vals[0]})" + return f"torch.relu({input_vals[0]})" # Row expand 操作 elif op_name == "block.row_expand_add": return f"{input_vals[0]} + {input_vals[1]}" # Broadcasting @@ -386,7 +443,7 @@ def _get_numpy_operation(self, op_name: str, input_vals: List[str]) -> str: return f"{input_vals[0]} / {input_vals[1]}" # 矩阵操作 elif op_name == "block.matmul": - return f"{input_vals[0]} @ {input_vals[1]}" + return f"torch.matmul({input_vals[0]}, {input_vals[1]})" else: return f"# 未知操作: {op_name}" @@ -395,8 +452,11 @@ def _generate_test_class( test_name: str, kernels: List[Dict[str, Any]], orch_info: Dict[str, Any], - numpy_code: str, + torch_code: str, shape: Tuple[int, int], + tensor_init_type: Optional[str] = None, + atol: float = 1e-5, + rtol: float = 1e-5, ) -> str: """生成完整的测试类代码 @@ -404,8 +464,11 @@ def _generate_test_class( test_name: 测试名称 kernels: 内核信息列表 orch_info: Orchestration 信息 - numpy_code: NumPy 参考实现代码 + torch_code: Torch 参考实现代码 shape: 张量形状 + tensor_init_type: 张量初始化类型(可选,如果不指定则使用全局配置) + atol: 绝对误差容忍度 + rtol: 相对误差容忍度 Returns: 完整的测试类代码 @@ -440,10 +503,13 @@ def _generate_test_class( f" 内核数量: {len(kernels)}", f" \"\"\"", f"", - f" def __init__(self, **kwargs):", - f" super().__init__(**kwargs)", - f" self.rows = {rows}", - f" self.cols = {cols}", + f" rows = 
{rows}", + f" cols = {cols}", + f"", + f" def __init__(self):", + f" super().__init__()", + f" self.config.atol = {atol}", + f" self.config.rtol = {rtol}", f"", f" def get_name(self) -> str:", f" return '{test_name}'", @@ -452,11 +518,11 @@ def _generate_test_class( f" return [", ] - # 定义输入张量 - 使用实际形状 - for inp_name in input_list: - init_val = 2.0 + input_list.index(inp_name) * 0.5 + # 定义输入张量 - 使用实际形状和配置的初始化类型 + for idx, inp_name in enumerate(input_list): inp_shape = input_shapes_map[inp_name] - code_lines.append(f" TensorSpec('{inp_name}', [{inp_shape[0]}, {inp_shape[1]}], DataType.FP32, init_value={init_val}),") + init_code = self._generate_tensor_init_value(idx, tensor_init_type) + code_lines.append(f" TensorSpec('{inp_name}', [{inp_shape[0]}, {inp_shape[1]}], DataType.FP32, {init_code}),") # 定义输出张量 - 使用实际输出形状 code_lines.append(f" TensorSpec('output', [{output_shape[0]}, {output_shape[1]}], DataType.FP32, is_output=True),") @@ -497,12 +563,15 @@ def _generate_test_class( code_lines.append(f" return {test_name.replace('_', ' ').title().replace(' ', '')}Program") code_lines.append(f"") - # 添加 NumPy 参考实现 + # 添加 Torch 参考实现 code_lines.append(f" def compute_expected(self, tensors, params=None):") - code_lines.append(f" \"\"\"使用 NumPy 计算期望输出\"\"\"") - # numpy_code 包含嵌套函数定义,需要添加到 compute_expected 内部,所以需要额外缩进 - numpy_lines = numpy_code.split('\n') - for line in numpy_lines: + code_lines.append(f" \"\"\"使用 Torch 计算期望输出\"\"\"") + code_lines.append(f" # 将 numpy 数组转换为 torch 张量(仅在输入边界)") + code_lines.append(f" torch_tensors = {{name: torch.from_numpy(arr) for name, arr in tensors.items() if not name.endswith('output')}}") + code_lines.append(f"") + # torch_code 包含嵌套函数定义,需要添加到 compute_expected 内部,所以需要额外缩进 + torch_lines = torch_code.split('\n') + for line in torch_lines: if line.strip(): # 跳过空行 code_lines.append(f" {line}") # 添加额外的4个空格缩进 else: @@ -520,20 +589,21 @@ def _generate_test_class( if i > 0 and result_var: # 第一个输入使用前一个结果(变量名) kernel_inputs[0] = result_var - # 
构建参数列表:第一个是变量,其他从 tensors 获取 + # 构建参数列表:第一个是变量,其他从 torch_tensors 获取 inputs_parts = [kernel_inputs[0]] for inp in kernel_inputs[1:]: - inputs_parts.append(f"tensors['{inp}']") + inputs_parts.append(f"torch_tensors['{inp}']") inputs_str = ", ".join(inputs_parts) else: - # 第一个内核,所有输入都从 tensors 获取 - inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) + # 第一个内核,所有输入都从 torch_tensors 获取 + inputs_str = ", ".join([f"torch_tensors['{inp}']" for inp in kernel_inputs]) result_var = f"result_{i}" # 调用嵌套函数不需要 self - code_lines.append(f" {result_var} = _numpy_{kernel_name}({inputs_str})") + code_lines.append(f" {result_var} = _torch_{kernel_name}({inputs_str})") - code_lines.append(f" tensors['output'][:] = {result_var}") + code_lines.append(f" # 将结果转换回 numpy 并写入输出") + code_lines.append(f" tensors['output'][:] = {result_var}.numpy()") elif orch_info["mode"] == "branching": code_lines.append(f" # 分支执行模式") @@ -544,20 +614,22 @@ def _generate_test_class( result_var = f"branch_{i}" branch_results.append(result_var) - inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) + inputs_str = ", ".join([f"torch_tensors['{inp}']" for inp in kernel_inputs]) # 调用嵌套函数不需要 self - code_lines.append(f" {result_var} = _numpy_{kernel_name}({inputs_str})") + code_lines.append(f" {result_var} = _torch_{kernel_name}({inputs_str})") # 合并结果 if len(branch_results) == 1: - code_lines.append(f" tensors['output'][:] = {branch_results[0]}") + code_lines.append(f" # 将结果转换回 numpy 并写入输出") + code_lines.append(f" tensors['output'][:] = {branch_results[0]}.numpy()") else: merged = branch_results[0] for i in range(1, len(branch_results)): new_merged = f"merged_{i}" code_lines.append(f" {new_merged} = {merged} + {branch_results[i]}") merged = new_merged - code_lines.append(f" tensors['output'][:] = {merged}") + code_lines.append(f" # 将结果转换回 numpy 并写入输出") + code_lines.append(f" tensors['output'][:] = {merged}.numpy()") elif orch_info["mode"] == "mixed": code_lines.append(f" # 混合执行模式") 
@@ -573,9 +645,9 @@ def _generate_test_class( result_var = f"parallel_{i}" branch_results.append(result_var) - inputs_str = ", ".join([f"tensors['{inp}']" for inp in kernel_inputs]) + inputs_str = ", ".join([f"torch_tensors['{inp}']" for inp in kernel_inputs]) # 调用嵌套函数不需要 self - code_lines.append(f" {result_var} = _numpy_{kernel_name}({inputs_str})") + code_lines.append(f" {result_var} = _torch_{kernel_name}({inputs_str})") # 合并并行结果 if len(branch_results) > 1: @@ -598,13 +670,14 @@ def _generate_test_class( # 第一个输入是变量,其他是张量 inputs_parts = [kernel_inputs[0]] for inp in kernel_inputs[1:]: - inputs_parts.append(f"tensors['{inp}']") + inputs_parts.append(f"torch_tensors['{inp}']") inputs_str = ", ".join(inputs_parts) # 调用嵌套函数不需要 self - code_lines.append(f" {result_var} = _numpy_{kernel_name}({inputs_str})") + code_lines.append(f" {result_var} = _torch_{kernel_name}({inputs_str})") current_result = result_var - code_lines.append(f" tensors['output'][:] = {current_result}") + code_lines.append(f" # 将结果转换回 numpy 并写入输出") + code_lines.append(f" tensors['output'][:] = {current_result}.numpy()") code_lines.append(f"") @@ -625,6 +698,7 @@ def generate_test_file( - mode: 组合模式 - shape: 张量形状 - num_ops_range: 操作数量范围 + - tensor_init_type: 张量初始化类型(可选) """ # 生成文件头 header = '''""" @@ -638,7 +712,7 @@ def generate_test_file( from pathlib import Path from typing import Any, List -import numpy as np +import torch import pytest from pto_test.core.test_case import DataType, PTOTestCase, TensorSpec @@ -662,6 +736,7 @@ def generate_test_file( shape=config.get("shape", (128, 128)), num_ops_range=config.get("num_ops_range", (3, 7)), input_shapes_list=config.get("input_shapes_list"), + tensor_init_type=config.get("tensor_init_type"), ) test_cases.append(test_code) diff --git a/src/pto_test/codegen/golden_generator.py b/src/pto_test/codegen/golden_generator.py index 1d8442b..dd0b84d 100644 --- a/src/pto_test/codegen/golden_generator.py +++ b/src/pto_test/codegen/golden_generator.py @@ -66,6 
+66,7 @@ def generate(self, test_case: "PTOTestCase") -> str: '"""', "", "import numpy as np", + "import torch", "", f"__outputs__ = {output_names!r}", f"TENSOR_ORDER = {tensor_order!r}", @@ -274,6 +275,7 @@ def generate_with_callback( '"""', "", "import numpy as np", + "import torch", "", f"__outputs__ = {output_names!r}", f"TENSOR_ORDER = {tensor_order!r}",