From cb8df81b7de47a4de1e8d6be7fd35160c9ff390f Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 12 Mar 2026 17:28:17 +0800 Subject: [PATCH 1/8] Add standalone per-case golden scripts --- .github/workflows/ci.yml | 13 +- .../scripts/generate_testcase.py | 64 +- .../scripts/run_remote_npu_validation.sh | 49 +- test/samples/Abs/abs_compare.py | 737 ++++++++++++++++++ test/samples/Abs/abs_golden.py | 737 ++++++++++++++++++ test/samples/Addc/addc_compare.py | 737 ++++++++++++++++++ test/samples/Addc/addc_golden.py | 737 ++++++++++++++++++ test/samples/Adds/adds_compare.py | 737 ++++++++++++++++++ test/samples/Adds/adds_golden.py | 737 ++++++++++++++++++ test/samples/Addsc/addsc_compare.py | 737 ++++++++++++++++++ test/samples/Addsc/addsc_golden.py | 737 ++++++++++++++++++ test/samples/And/and_compare.py | 737 ++++++++++++++++++ test/samples/And/and_golden.py | 737 ++++++++++++++++++ test/samples/Ands/ands_compare.py | 737 ++++++++++++++++++ test/samples/Ands/ands_golden.py | 737 ++++++++++++++++++ test/samples/Cmp/cmp_compare.py | 737 ++++++++++++++++++ test/samples/Cmp/cmp_golden.py | 737 ++++++++++++++++++ test/samples/Cmps/cmps_compare.py | 737 ++++++++++++++++++ test/samples/Cmps/cmps_golden.py | 737 ++++++++++++++++++ test/samples/Colexpand/colexpand_compare.py | 737 ++++++++++++++++++ test/samples/Colexpand/colexpand_golden.py | 737 ++++++++++++++++++ test/samples/Colmax/colmax_compare.py | 737 ++++++++++++++++++ test/samples/Colmax/colmax_golden.py | 737 ++++++++++++++++++ test/samples/Colmin/colmin_compare.py | 737 ++++++++++++++++++ test/samples/Colmin/colmin_golden.py | 737 ++++++++++++++++++ test/samples/Colsum/colsum_compare.py | 737 ++++++++++++++++++ test/samples/Colsum/colsum_golden.py | 737 ++++++++++++++++++ test/samples/Div/div_compare.py | 737 ++++++++++++++++++ test/samples/Div/div_golden.py | 737 ++++++++++++++++++ test/samples/Divs/divs_compare.py | 737 ++++++++++++++++++ test/samples/Divs/divs_golden.py | 737 
++++++++++++++++++ test/samples/Divs2/divs2_compare.py | 737 ++++++++++++++++++ test/samples/Divs2/divs2_golden.py | 737 ++++++++++++++++++ test/samples/Exp/exp_compare.py | 737 ++++++++++++++++++ test/samples/Exp/exp_golden.py | 737 ++++++++++++++++++ test/samples/Expands/expand_compare.py | 737 ++++++++++++++++++ test/samples/Expands/expand_golden.py | 737 ++++++++++++++++++ test/samples/Expands/expands_compare.py | 737 ++++++++++++++++++ test/samples/Expands/expands_golden.py | 737 ++++++++++++++++++ test/samples/Log/log_compare.py | 737 ++++++++++++++++++ test/samples/Log/log_golden.py | 737 ++++++++++++++++++ test/samples/Lrelu/lrelu_compare.py | 737 ++++++++++++++++++ test/samples/Lrelu/lrelu_golden.py | 737 ++++++++++++++++++ test/samples/Max/max_compare.py | 737 ++++++++++++++++++ test/samples/Max/max_golden.py | 737 ++++++++++++++++++ test/samples/Maxs/maxs_compare.py | 737 ++++++++++++++++++ test/samples/Maxs/maxs_golden.py | 737 ++++++++++++++++++ test/samples/Min/min_compare.py | 737 ++++++++++++++++++ test/samples/Min/min_golden.py | 737 ++++++++++++++++++ test/samples/Mins/mins_compare.py | 737 ++++++++++++++++++ test/samples/Mins/mins_golden.py | 737 ++++++++++++++++++ test/samples/Mul/mul_compare.py | 737 ++++++++++++++++++ test/samples/Mul/mul_golden.py | 737 ++++++++++++++++++ test/samples/Muls/muls_compare.py | 737 ++++++++++++++++++ test/samples/Muls/muls_golden.py | 737 ++++++++++++++++++ test/samples/Neg/neg_compare.py | 737 ++++++++++++++++++ test/samples/Neg/neg_golden.py | 737 ++++++++++++++++++ test/samples/Not/not_compare.py | 737 ++++++++++++++++++ test/samples/Not/not_golden.py | 737 ++++++++++++++++++ test/samples/Or/or_compare.py | 737 ++++++++++++++++++ test/samples/Or/or_golden.py | 737 ++++++++++++++++++ test/samples/Ors/ors_compare.py | 737 ++++++++++++++++++ test/samples/Ors/ors_golden.py | 737 ++++++++++++++++++ test/samples/Partadd/partadd_compare.py | 737 ++++++++++++++++++ test/samples/Partadd/partadd_golden.py | 737 
++++++++++++++++++ test/samples/Partmax/partmax_compare.py | 737 ++++++++++++++++++ test/samples/Partmax/partmax_golden.py | 737 ++++++++++++++++++ test/samples/Partmin/partmin_compare.py | 737 ++++++++++++++++++ test/samples/Partmin/partmin_golden.py | 737 ++++++++++++++++++ test/samples/Prelu/prelu_compare.py | 737 ++++++++++++++++++ test/samples/Prelu/prelu_golden.py | 737 ++++++++++++++++++ test/samples/Recip/recip_compare.py | 737 ++++++++++++++++++ test/samples/Recip/recip_golden.py | 737 ++++++++++++++++++ test/samples/Relu/relu_compare.py | 737 ++++++++++++++++++ test/samples/Relu/relu_golden.py | 737 ++++++++++++++++++ test/samples/Rem/rem_compare.py | 737 ++++++++++++++++++ test/samples/Rem/rem_golden.py | 737 ++++++++++++++++++ test/samples/Rems/rems_compare.py | 737 ++++++++++++++++++ test/samples/Rems/rems_golden.py | 737 ++++++++++++++++++ test/samples/Rowexpand/rowexpand_compare.py | 737 ++++++++++++++++++ test/samples/Rowexpand/rowexpand_golden.py | 737 ++++++++++++++++++ .../Rowexpanddiv/rowexpanddiv_compare.py | 737 ++++++++++++++++++ .../Rowexpanddiv/rowexpanddiv_golden.py | 737 ++++++++++++++++++ .../Rowexpandmul/rowexpandmul_compare.py | 737 ++++++++++++++++++ .../Rowexpandmul/rowexpandmul_golden.py | 737 ++++++++++++++++++ .../Rowexpandsub/rowexpandsub_compare.py | 737 ++++++++++++++++++ .../Rowexpandsub/rowexpandsub_golden.py | 737 ++++++++++++++++++ test/samples/Rowmax/rowmax_compare.py | 737 ++++++++++++++++++ test/samples/Rowmax/rowmax_golden.py | 737 ++++++++++++++++++ test/samples/Rowmin/rowmin_compare.py | 737 ++++++++++++++++++ test/samples/Rowmin/rowmin_golden.py | 737 ++++++++++++++++++ test/samples/Rowsum/rowsum_compare.py | 737 ++++++++++++++++++ test/samples/Rowsum/rowsum_golden.py | 737 ++++++++++++++++++ test/samples/Rsqrt/rsqrt_compare.py | 737 ++++++++++++++++++ test/samples/Rsqrt/rsqrt_golden.py | 737 ++++++++++++++++++ test/samples/Sel/sel_compare.py | 737 ++++++++++++++++++ test/samples/Sel/sel_golden.py | 737 
++++++++++++++++++ test/samples/Sels/sels_compare.py | 737 ++++++++++++++++++ test/samples/Sels/sels_golden.py | 737 ++++++++++++++++++ test/samples/Shl/shl_compare.py | 737 ++++++++++++++++++ test/samples/Shl/shl_golden.py | 737 ++++++++++++++++++ test/samples/Shls/shls_compare.py | 737 ++++++++++++++++++ test/samples/Shls/shls_golden.py | 737 ++++++++++++++++++ test/samples/Shr/shr_compare.py | 737 ++++++++++++++++++ test/samples/Shr/shr_golden.py | 737 ++++++++++++++++++ test/samples/Shrs/shrs_compare.py | 737 ++++++++++++++++++ test/samples/Shrs/shrs_golden.py | 737 ++++++++++++++++++ test/samples/Sqrt/sqrt_compare.py | 737 ++++++++++++++++++ test/samples/Sqrt/sqrt_golden.py | 737 ++++++++++++++++++ test/samples/Sub/sub_compare.py | 737 ++++++++++++++++++ test/samples/Sub/sub_golden.py | 737 ++++++++++++++++++ test/samples/Subc/subc_compare.py | 737 ++++++++++++++++++ test/samples/Subc/subc_golden.py | 737 ++++++++++++++++++ test/samples/Subs/subs_compare.py | 737 ++++++++++++++++++ test/samples/Subs/subs_golden.py | 737 ++++++++++++++++++ test/samples/Subsc/subsc_compare.py | 737 ++++++++++++++++++ test/samples/Subsc/subsc_golden.py | 737 ++++++++++++++++++ .../VectorAddition/vadd_pto_ir_compare.py | 737 ++++++++++++++++++ .../VectorAddition/vadd_pto_ir_golden.py | 737 ++++++++++++++++++ .../VectorAddition/vectorAddition_compare.py | 737 ++++++++++++++++++ .../VectorAddition/vectorAddition_golden.py | 737 ++++++++++++++++++ test/samples/Xor/xor_compare.py | 737 ++++++++++++++++++ test/samples/Xor/xor_golden.py | 737 ++++++++++++++++++ test/samples/Xors/xors_compare.py | 737 ++++++++++++++++++ test/samples/Xors/xors_golden.py | 737 ++++++++++++++++++ 125 files changed, 90014 insertions(+), 26 deletions(-) create mode 100755 test/samples/Abs/abs_compare.py create mode 100755 test/samples/Abs/abs_golden.py create mode 100755 test/samples/Addc/addc_compare.py create mode 100755 test/samples/Addc/addc_golden.py create mode 100755 test/samples/Adds/adds_compare.py 
create mode 100755 test/samples/Adds/adds_golden.py create mode 100755 test/samples/Addsc/addsc_compare.py create mode 100755 test/samples/Addsc/addsc_golden.py create mode 100755 test/samples/And/and_compare.py create mode 100755 test/samples/And/and_golden.py create mode 100755 test/samples/Ands/ands_compare.py create mode 100755 test/samples/Ands/ands_golden.py create mode 100755 test/samples/Cmp/cmp_compare.py create mode 100755 test/samples/Cmp/cmp_golden.py create mode 100755 test/samples/Cmps/cmps_compare.py create mode 100755 test/samples/Cmps/cmps_golden.py create mode 100755 test/samples/Colexpand/colexpand_compare.py create mode 100755 test/samples/Colexpand/colexpand_golden.py create mode 100755 test/samples/Colmax/colmax_compare.py create mode 100755 test/samples/Colmax/colmax_golden.py create mode 100755 test/samples/Colmin/colmin_compare.py create mode 100755 test/samples/Colmin/colmin_golden.py create mode 100755 test/samples/Colsum/colsum_compare.py create mode 100755 test/samples/Colsum/colsum_golden.py create mode 100755 test/samples/Div/div_compare.py create mode 100755 test/samples/Div/div_golden.py create mode 100755 test/samples/Divs/divs_compare.py create mode 100755 test/samples/Divs/divs_golden.py create mode 100755 test/samples/Divs2/divs2_compare.py create mode 100755 test/samples/Divs2/divs2_golden.py create mode 100755 test/samples/Exp/exp_compare.py create mode 100755 test/samples/Exp/exp_golden.py create mode 100755 test/samples/Expands/expand_compare.py create mode 100755 test/samples/Expands/expand_golden.py create mode 100755 test/samples/Expands/expands_compare.py create mode 100755 test/samples/Expands/expands_golden.py create mode 100755 test/samples/Log/log_compare.py create mode 100755 test/samples/Log/log_golden.py create mode 100755 test/samples/Lrelu/lrelu_compare.py create mode 100755 test/samples/Lrelu/lrelu_golden.py create mode 100755 test/samples/Max/max_compare.py create mode 100755 test/samples/Max/max_golden.py 
create mode 100755 test/samples/Maxs/maxs_compare.py create mode 100755 test/samples/Maxs/maxs_golden.py create mode 100755 test/samples/Min/min_compare.py create mode 100755 test/samples/Min/min_golden.py create mode 100755 test/samples/Mins/mins_compare.py create mode 100755 test/samples/Mins/mins_golden.py create mode 100755 test/samples/Mul/mul_compare.py create mode 100755 test/samples/Mul/mul_golden.py create mode 100755 test/samples/Muls/muls_compare.py create mode 100755 test/samples/Muls/muls_golden.py create mode 100755 test/samples/Neg/neg_compare.py create mode 100755 test/samples/Neg/neg_golden.py create mode 100755 test/samples/Not/not_compare.py create mode 100755 test/samples/Not/not_golden.py create mode 100755 test/samples/Or/or_compare.py create mode 100755 test/samples/Or/or_golden.py create mode 100755 test/samples/Ors/ors_compare.py create mode 100755 test/samples/Ors/ors_golden.py create mode 100755 test/samples/Partadd/partadd_compare.py create mode 100755 test/samples/Partadd/partadd_golden.py create mode 100755 test/samples/Partmax/partmax_compare.py create mode 100755 test/samples/Partmax/partmax_golden.py create mode 100755 test/samples/Partmin/partmin_compare.py create mode 100755 test/samples/Partmin/partmin_golden.py create mode 100755 test/samples/Prelu/prelu_compare.py create mode 100755 test/samples/Prelu/prelu_golden.py create mode 100755 test/samples/Recip/recip_compare.py create mode 100755 test/samples/Recip/recip_golden.py create mode 100755 test/samples/Relu/relu_compare.py create mode 100755 test/samples/Relu/relu_golden.py create mode 100755 test/samples/Rem/rem_compare.py create mode 100755 test/samples/Rem/rem_golden.py create mode 100755 test/samples/Rems/rems_compare.py create mode 100755 test/samples/Rems/rems_golden.py create mode 100755 test/samples/Rowexpand/rowexpand_compare.py create mode 100755 test/samples/Rowexpand/rowexpand_golden.py create mode 100755 test/samples/Rowexpanddiv/rowexpanddiv_compare.py create 
mode 100755 test/samples/Rowexpanddiv/rowexpanddiv_golden.py create mode 100755 test/samples/Rowexpandmul/rowexpandmul_compare.py create mode 100755 test/samples/Rowexpandmul/rowexpandmul_golden.py create mode 100755 test/samples/Rowexpandsub/rowexpandsub_compare.py create mode 100755 test/samples/Rowexpandsub/rowexpandsub_golden.py create mode 100755 test/samples/Rowmax/rowmax_compare.py create mode 100755 test/samples/Rowmax/rowmax_golden.py create mode 100755 test/samples/Rowmin/rowmin_compare.py create mode 100755 test/samples/Rowmin/rowmin_golden.py create mode 100755 test/samples/Rowsum/rowsum_compare.py create mode 100755 test/samples/Rowsum/rowsum_golden.py create mode 100755 test/samples/Rsqrt/rsqrt_compare.py create mode 100755 test/samples/Rsqrt/rsqrt_golden.py create mode 100755 test/samples/Sel/sel_compare.py create mode 100755 test/samples/Sel/sel_golden.py create mode 100755 test/samples/Sels/sels_compare.py create mode 100755 test/samples/Sels/sels_golden.py create mode 100755 test/samples/Shl/shl_compare.py create mode 100755 test/samples/Shl/shl_golden.py create mode 100755 test/samples/Shls/shls_compare.py create mode 100755 test/samples/Shls/shls_golden.py create mode 100755 test/samples/Shr/shr_compare.py create mode 100755 test/samples/Shr/shr_golden.py create mode 100755 test/samples/Shrs/shrs_compare.py create mode 100755 test/samples/Shrs/shrs_golden.py create mode 100755 test/samples/Sqrt/sqrt_compare.py create mode 100755 test/samples/Sqrt/sqrt_golden.py create mode 100755 test/samples/Sub/sub_compare.py create mode 100755 test/samples/Sub/sub_golden.py create mode 100755 test/samples/Subc/subc_compare.py create mode 100755 test/samples/Subc/subc_golden.py create mode 100755 test/samples/Subs/subs_compare.py create mode 100755 test/samples/Subs/subs_golden.py create mode 100755 test/samples/Subsc/subsc_compare.py create mode 100755 test/samples/Subsc/subsc_golden.py create mode 100755 test/samples/VectorAddition/vadd_pto_ir_compare.py 
create mode 100755 test/samples/VectorAddition/vadd_pto_ir_golden.py create mode 100755 test/samples/VectorAddition/vectorAddition_compare.py create mode 100755 test/samples/VectorAddition/vectorAddition_golden.py create mode 100755 test/samples/Xor/xor_compare.py create mode 100755 test/samples/Xor/xor_golden.py create mode 100755 test/samples/Xors/xors_compare.py create mode 100755 test/samples/Xors/xors_golden.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 111847a1..f25e4156 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ on: skip_cases: description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)" type: string - default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print" + default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic" run_only_cases: description: "Comma/space separated testcase names to run (empty = run all)" type: string @@ -204,6 +204,15 @@ jobs: cp test/npu_validation/scripts/generate_testcase.py "${PAYLOAD_DIR}/test/npu_validation/scripts/" cp test/npu_validation/scripts/run_remote_npu_validation.sh "${PAYLOAD_DIR}/test/npu_validation/scripts/" cp test/npu_validation/templates/* "${PAYLOAD_DIR}/test/npu_validation/templates/" + while IFS= read -r -d '' file; do + dst="${PAYLOAD_DIR}/${file}" + mkdir -p "$(dirname "${dst}")" + cp "${file}" "${dst}" + done < <( + find test/samples \ + \( -path '*/npu_validation/*' -o -name '*_golden.py' -o -name '*_compare.py' \) \ + -type f -print0 + ) chmod +x "${PAYLOAD_DIR}/test/npu_validation/scripts/run_remote_npu_validation.sh" tar -czf "${PAYLOAD_TGZ}" -C "${PAYLOAD_DIR}" . @@ -238,7 +247,7 @@ jobs: # Temporary CI gate: skip cases that still error/flap on the remote NPU. # Update this list as we fix the underlying issues. 
DEFAULT_SKIP_CASES: >- - mix_kernel,vadd_validshape,vadd_validshape_dynamic,print + mix_kernel,vadd_validshape,vadd_validshape_dynamic steps: - name: Resolve validation parameters shell: bash diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index 217a4eaf..481f7b81 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -4,6 +4,7 @@ import argparse import ast import re +import shutil from pathlib import Path from typing import Optional @@ -266,6 +267,33 @@ def _derive_testcase_name(input_cpp: Path) -> str: return name + +def _resolve_sample_root(input_cpp: Path) -> Path: + parent = input_cpp.parent + if parent.name == "npu_validation": + return parent.parent + if parent.parent.name == "npu_validation": + return parent.parent.parent + return parent + + +def _find_custom_case_asset(sample_root: Path, testcase: str, filename: str) -> Optional[Path]: + candidates = ( + sample_root / f"{testcase}_{filename}", + sample_root / "npu_validation" / testcase / filename, + sample_root / "npu_validation" / filename, + ) + for candidate in candidates: + if candidate.is_file(): + return candidate + return None + + +def _copy_asset_if_needed(src: Path, dst: Path): + if src.resolve() == dst.resolve(): + return + shutil.copy2(src, dst) + + def _replace_includes(text: str) -> str: if "#include \"common/pto_instr.hpp\"" in text: return text.replace("#include \"common/pto_instr.hpp\"", INCLUDE_REPLACEMENT.rstrip()) @@ -814,13 +842,16 @@ def generate_testcase( soc_version: str, aicore_arch: Optional[str] = None, ): - sample_dir = input_cpp.parent + sample_root = _resolve_sample_root(input_cpp) if output_root: - output_dir = output_root / sample_dir.name / testcase + output_dir = output_root / sample_root.name / testcase else: - output_dir = sample_dir / "npu_validation" / testcase + output_dir = sample_root / "npu_validation" / testcase 
output_dir.mkdir(parents=True, exist_ok=True) + custom_golden = _find_custom_case_asset(sample_root, testcase, "golden.py") + custom_compare = _find_custom_case_asset(sample_root, testcase, "compare.py") + raw_kernel = input_cpp.read_text(encoding="utf-8") raw_kernel_for_analysis = raw_kernel # pto.tcmp / pto.tcmps produce packed predicate masks and leave parts of the @@ -1137,8 +1168,12 @@ def generate_testcase( input_generate.append(f" {name} = np.random.random(size=({size},)).astype({np_dtype})") input_generate.append(f" {name}.tofile(\"{name}.bin\")") - golden_py = golden_template.replace("@INPUT_GENERATE@", "\n".join(input_generate)) - (output_dir / "golden.py").write_text(golden_py, encoding="utf-8") + golden_dst = output_dir / "golden.py" + if custom_golden is not None: + _copy_asset_if_needed(custom_golden, golden_dst) + else: + golden_py = golden_template.replace("@INPUT_GENERATE@", "\n".join(input_generate)) + golden_dst.write_text(golden_py, encoding="utf-8") # Emit the kernel source, optionally injecting a packed-predicate preload to # make TCMP/TCMPS outputs deterministic for byte-wise compares. 
@@ -1381,8 +1416,23 @@ def generate_testcase( compare_lines.append( f" ok = compare_bin(\"golden_{name}.bin\", \"{name}.bin\", {np_dtype}, {eps}) and ok" ) - compare_py = compare_template.replace("@COMPARES@", "\n".join(compare_lines)) - (output_dir / "compare.py").write_text(compare_py, encoding="utf-8") + compare_dst = output_dir / "compare.py" + if custom_compare is not None: + _copy_asset_if_needed(custom_compare, compare_dst) + else: + compare_py = compare_template.replace("@COMPARES@", "\n".join(compare_lines)) + compare_dst.write_text(compare_py, encoding="utf-8") + + (output_dir / "validation_meta.env").write_text( + "\n".join( + [ + f"CUSTOM_GOLDEN={1 if custom_golden is not None else 0}", + f"CUSTOM_COMPARE={1 if custom_compare is not None else 0}", + "", + ] + ), + encoding="utf-8", + ) # Let the runner know which bins are outputs (for sim->golden copying). (output_dir / "outputs.txt").write_text( diff --git a/test/npu_validation/scripts/run_remote_npu_validation.sh b/test/npu_validation/scripts/run_remote_npu_validation.sh index 43f766dd..8393452f 100644 --- a/test/npu_validation/scripts/run_remote_npu_validation.sh +++ b/test/npu_validation/scripts/run_remote_npu_validation.sh @@ -74,14 +74,7 @@ for f in "$HOME/.bash_profile" "$HOME/.bashrc"; do source_rc "$f" done -if [[ -f "/usr/local/Ascend/cann/set_env.sh" ]]; then - log "Sourcing /usr/local/Ascend/cann/set_env.sh" - set +e +u +o pipefail - # shellcheck disable=SC1091 - source "/usr/local/Ascend/cann/set_env.sh" || true - set -euo pipefail - set -o pipefail -elif [[ -f "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" ]]; then +if [[ -f "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" ]]; then log "Sourcing /usr/local/Ascend/ascend-toolkit/latest/set_env.sh" set +e +u +o pipefail # shellcheck disable=SC1091 @@ -101,7 +94,7 @@ command -v bisheng || true bisheng --version || true if [[ -z "${ASCEND_HOME_PATH:-}" ]]; then - for d in /usr/local/Ascend/cann /usr/local/Ascend/cann-* 
/usr/local/Ascend/ascend-toolkit/latest; do + for d in /usr/local/Ascend/ascend-toolkit/latest /usr/local/Ascend/cann-*; do [[ -d "$d" ]] || continue export ASCEND_HOME_PATH="$d" break @@ -233,6 +226,13 @@ while IFS= read -r -d '' cpp; do cd "${nv_dir}" export ACL_DEVICE_ID="${DEVICE_ID}" + CUSTOM_GOLDEN=0 + CUSTOM_COMPARE=0 + if [[ -f "./validation_meta.env" ]]; then + # shellcheck disable=SC1091 + source "./validation_meta.env" + fi + enable_sim_golden="OFF" [[ "${GOLDEN_MODE}" == "sim" ]] && enable_sim_golden="ON" cmake -S . -B ./build \ @@ -264,12 +264,23 @@ while IFS= read -r -d '' cpp; do case "${GOLDEN_MODE}" in sim) python3 ./golden.py - LD_LIBRARY_PATH="${LD_LIBRARY_PATH_SIM}" ./build/${testcase}_sim - copy_outputs_as_golden - if [[ "${RUN_MODE}" == "npu" ]]; then - LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" ./build/${testcase} + if [[ "${CUSTOM_GOLDEN}" == "1" ]]; then + log "Using custom golden for ${testcase}" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_SIM}" ./build/${testcase}_sim + COMPARE_STRICT=1 python3 ./compare.py + if [[ "${RUN_MODE}" == "npu" ]]; then + python3 ./golden.py + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" ./build/${testcase} + COMPARE_STRICT=1 python3 ./compare.py + fi + else + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_SIM}" ./build/${testcase}_sim + copy_outputs_as_golden + if [[ "${RUN_MODE}" == "npu" ]]; then + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" ./build/${testcase} + fi + COMPARE_STRICT=1 python3 ./compare.py fi - COMPARE_STRICT=1 python3 ./compare.py ;; npu) if [[ "${RUN_MODE}" != "npu" ]]; then @@ -278,9 +289,13 @@ while IFS= read -r -d '' cpp; do fi python3 ./golden.py LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" ./build/${testcase} - copy_outputs_as_golden - python3 ./golden.py - LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" ./build/${testcase} + if [[ "${CUSTOM_GOLDEN}" != "1" ]]; then + copy_outputs_as_golden + python3 ./golden.py + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" ./build/${testcase} + else + log "Using custom golden for ${testcase}" 
+ fi COMPARE_STRICT=1 python3 ./compare.py ;; skip) diff --git a/test/samples/Abs/abs_compare.py b/test/samples/Abs/abs_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Abs/abs_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in 
Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return 
vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> 
Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported 
binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got 
def generate_prelu_case():
    """Generate src/slope inputs and the PReLU golden (slope on negatives)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    # Draw order matters: src first, then slope, to keep the RNG stream stable.
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    result = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate three inputs and the golden for the add/sub-with-carry ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate input and golden for scalar ops that accumulate into the source.

    NOTE(review): the golden is src (+/-) scalar + src — the extra "+ src"
    term presumably models a destination that already holds src; confirm
    against the kernel's accumulate semantics.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        result = src + np.float32(scalar) + src
    elif op == "subsc":
        result = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate a ROWSxCOLS input and its per-row reduction golden."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reducers[op](src_m).astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate input(s) and the per-column reduction golden.

    colsum cases carry an extra scratch buffer and may accumulate into a
    pre-seeded output; the other reductions take a single input.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        # Accumulating variant: seed the output with small signed values.
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        result = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            result = result + out_init
    elif op == "colmax":
        result = src_m.max(axis=0)
    elif op == "colmin":
        result = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: result.astype(np.float32)})
def generate_rowexpand_case():
    """Golden broadcasts column 0 of the input across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(src_m[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden broadcasts row 0 of the input down all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(src_m[:1, :], (ROWS, 1))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden combines src0 with one scalar per row from src1's first ROWS elements."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    src0_m = _as_matrix(src0)
    src1_m = _as_matrix(src1)
    col = src1_m.reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    kernels = {
        "rowexpandmul": lambda: src0_m * col,
        "rowexpanddiv": lambda: src0_m / col,
        "rowexpandsub": lambda: src0_m - col,
    }
    if op not in kernels:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): kernels[op]().astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All inputs stay zero; the golden is the output filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate compare inputs and the packed predicate-mask golden.

    "cmp" is element-wise src0 < src1; "cmps" is src0 > scalar.  The golden
    is packed one bit per element into the device mask layout.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    total = meta.elem_counts[out_name]
    if total % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={total}")
    packed = pack_predicate_mask(pred, storage_cols=total // ROWS)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Golden selects src0 where the packed mask bit is 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Mask bits drawn before the data so the RNG stream matches the original.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden is src0 when select_mode == 1, otherwise src1 (no masking)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate one integer input and the golden for op applied to itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift counts stay tiny so self-shifts remain well-defined on device.
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    kernels = {
        "and": lambda v: np.bitwise_and(v, v),
        "or": lambda v: np.bitwise_or(v, v),
        "xor": lambda v: np.bitwise_xor(v, v),
        "shl": lambda v: np.left_shift(v, v),
        "shr": lambda v: np.right_shift(v, v),
        "not": np.bitwise_not,
    }
    if op not in kernels:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(kernels[op](src), dtype=dtype)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate an integer input and the golden for op applied with a scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Wrap the scalar into the target dtype so it matches device arithmetic.
    scalar = np.asarray(scalar, dtype=dtype).item()
    kernels = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in kernels:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(kernels[op](src, scalar), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two raw binaries element-wise with atol=rtol=eps.

    Returns True on match; on any missing file, shape mismatch or tolerance
    violation it prints a diagnostic (worst element for non-empty numeric
    data) and returns False.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so integer/unsigned types cannot wrap around.
    wide = np.float64
    if np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
        wide = np.int64
    golden_cmp = golden.astype(wide, copy=False)
    output_cmp = output.astype(wide, copy=False)
    abs_diff = np.abs(golden_cmp - output_cmp)
    idx = int(np.argmax(abs_diff))
    diff = float(abs_diff[idx])
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
        f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Bit-exact compare of packed predicate masks over the meaningful bytes."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # NOTE(review): `cols` here is the byte stride per row; only the packed
    # word bytes are compared, and min() guards strides narrower than one
    # 64-bit word — confirm against the kernel's mask layout.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatch = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatch[0]) if mismatch.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False
def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden; gate via finalize_compare."""
    meta = load_case_meta()
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every packed-mask output against its golden; gate via finalize_compare."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Report the verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-4)


# --- patch boundary: end of the *_compare.py script; the hunk below starts
# --- the new file test/samples/Abs/abs_golden.py (mode 100755, 737 lines).

#!/usr/bin/python3
# coding=utf-8

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


# Deterministic generation: one fixed seed; every case is a 32x32 tile.
SEED = 19
ROWS = 32
COLS = 32

# Host-side C type name -> numpy dtype used to (de)serialize the .bin buffers.
# bfloat16 has no numpy dtype, so it is carried as raw uint16 bit patterns.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Buffer metadata scraped from a generated main.cpp plus outputs.txt."""

    elem_counts: Dict[str, int]    # tensor name -> element count
    np_types: Dict[str, np.dtype]  # tensor name -> numpy dtype
    read_order: List[str]          # order in which main.cpp ReadFile()s .bin files
    outputs: List[str]             # names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Every read buffer that is not an output."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host C type name to its numpy dtype; KeyError when unknown."""
    host_type = host_type.strip()
    if host_type in _HOST_TYPE_TO_NP:
        return np.dtype(_HOST_TYPE_TO_NP[host_type])
    raise KeyError(f"unsupported host type: {host_type}")


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Parse elemCount_/fileSize_ declarations and the ReadFile order from main.cpp."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    elem_counts = {}
    for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text):
        elem_counts[match.group(1)] = int(match.group(2))
    np_types = {}
    for match in re.finditer(
        r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
        text,
    ):
        np_types[match.group(1)] = _host_type_to_np(match.group(2))
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs: List[str] = []
    outputs_file = Path(outputs_txt)
    if outputs_file.is_file():
        for raw in outputs_file.read_text(encoding="utf-8").splitlines():
            name = raw.strip()
            if name:
                outputs.append(name)
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)


def _rng():
    """Fresh deterministic generator so every script reproduces the same data."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as a rows x cols matrix, validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw float32 test data; `style` picks the value range / domain guard."""
    spans = {
        "signed": (-3.0, 3.0),
        "signed_small": (-1.5, 1.5),
        "nonzero_signed": (-3.0, 3.0),
        "positive": (0.25, 4.0),  # keeps log/sqrt/recip domains valid
        "exp": (-2.0, 2.0),       # keeps exp() away from overflow
        "cmp": (-2.0, 2.0),
    }
    if style not in spans:
        raise ValueError(f"unsupported float style: {style}")
    low, high = spans[style]
    arr = rng.uniform(low, high, size=count).astype(np.float32)
    if style == "nonzero_signed":
        # Push near-zero divisors out to +/-0.25 so goldens stay finite.
        near_zero = np.abs(arr) < np.float32(0.25)
        arr[near_zero] = np.where(arr[near_zero] >= 0.0, np.float32(0.25), np.float32(-0.25))
    return arr
def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw integer test data for bitwise/shift ops.

    "bitwise" draws small signed values in [-256, 256); "shift_small" draws
    shift counts in [0, 4) so shifts stay well-defined for every lane width.
    Only the dtype/style pairs the samples actually use are supported; any
    other combination raises ValueError.
    """
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes per packed mask row: one little-endian uint64 word per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into the device layout (flat uint8 array).

    Layout: per row, ceil(cols/64) uint64 words as little-endian bytes, bit i
    of word w holding column w*64+i; the row is then zero-padded out to
    `storage_cols` bytes.  Vectorized with a single np.packbits call
    (bitorder="little" reproduces word.to_bytes(8, "little") plus
    bit-within-word order) instead of the original O(rows*cols) per-bit
    Python loops.

    Raises ValueError when `bits` is not 2D or `storage_cols` cannot hold
    one row's packed words.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    # Pad to whole 64-bit words; padding bits are zero, as in the loop version.
    padded = np.zeros((rows, row_bytes * 8), dtype=np.bool_)
    padded[:, :cols] = bits
    packed = np.packbits(padded, axis=1, bitorder="little")  # (rows, row_bytes)
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    out[:, :row_bytes] = packed
    return out.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows=None, cols=None) -> np.ndarray:
    """Inverse of pack_predicate_mask: flat uint8 buffer -> (rows, cols) bools.

    rows/cols default to the module-wide ROWS/COLS; the defaults are resolved
    at call time (not def time) so the helper also works when the constants
    are bound later.  Vectorized with np.unpackbits instead of the original
    per-bit Python loops.
    """
    rows = ROWS if rows is None else int(rows)
    cols = COLS if cols is None else int(cols)
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)[:, :row_bytes]
    bits = np.unpackbits(packed, axis=1, bitorder="little")
    return bits[:, :cols].astype(np.bool_)


def _zero_buffer(meta: "CaseMeta", name: str) -> np.ndarray:
    """All-zero buffer with `name`'s declared dtype and element count."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: "CaseMeta") -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every tensor main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: "CaseMeta", buffers: Dict[str, np.ndarray]):
    """Dump each buffer in meta.read_order to <name>.bin, validating its size.

    Raises KeyError on a missing buffer, ValueError on an element-count
    mismatch against the count parsed from main.cpp.
    """
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: "CaseMeta", outputs: Dict[str, np.ndarray]):
    """Dump each expected output tensor to golden_<name>.bin, validating size."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")
def _single_output(meta: "CaseMeta") -> str:
    """Return the only declared output name, rejecting multi-output cases."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Write both inputs and the float32 golden for an element-wise binary op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    left_name, right_name = meta.inputs
    left = _float_values(rng, meta.elem_counts[left_name], style="signed")
    # div/rem need divisors bounded away from zero.
    right = _float_values(
        rng,
        meta.elem_counts[right_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[left_name] = left
    buffers[right_name] = right
    _write_buffers(meta, buffers)

    if op == "add":
        result = left + right
    elif op == "sub":
        result = left - right
    elif op == "mul":
        result = left * right
    elif op == "div":
        result = left / right
    elif op == "max":
        result = np.maximum(left, right)
    elif op == "min":
        result = np.minimum(left, right)
    elif op == "rem":
        result = np.fmod(left, right)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write the input and float32 golden for scalar / unary float ops.

    NOTE(review): "divs"/"rems" and the positive-domain ops redraw the source
    from the shared generator; kept verbatim so the RNG stream (and therefore
    every .bin on disk) matches the original per-case scripts.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    k = np.float32(scalar)
    if op == "adds":
        result = src + k
    elif op == "subs":
        result = src - k
    elif op == "muls":
        result = src * k
    elif op == "divs":
        result = k / src if scalar_left else src / k
    elif op == "maxs":
        result = np.maximum(src, k)
    elif op == "mins":
        result = np.minimum(src, k)
    elif op == "rems":
        result = np.fmod(src, k)
    elif op == "lrelu":
        result = np.where(src > 0.0, src, src * k)
    elif op == "exp":
        result = np.exp(src)
    elif op == "log":
        result = np.log(src)
    elif op == "sqrt":
        result = np.sqrt(src)
    elif op == "rsqrt":
        result = 1.0 / np.sqrt(src)
    elif op == "recip":
        result = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Write the input and float32 golden for a one-operand element-wise op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the math domain valid
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        result = np.abs(src)
    elif op == "neg":
        result = -src
    elif op == "exp":
        result = np.exp(src)
    elif op == "log":
        result = np.log(src)
    elif op == "sqrt":
        result = np.sqrt(src)
    elif op == "rsqrt":
        result = 1.0 / np.sqrt(src)
    elif op == "recip":
        result = 1.0 / src
    elif op == "relu":
        result = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_prelu_case():
    """Write src/slope inputs and the PReLU golden (slope on negatives)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    result = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_addc_case(op: str):
    """Write three inputs and the golden for the add/sub-with-carry ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Write input and golden for scalar ops that accumulate into the source.

    NOTE(review): the extra "+ src" term presumably models a destination that
    already holds src; confirm against the kernel's accumulate semantics.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        result = src + np.float32(scalar) + src
    elif op == "subsc":
        result = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Write a ROWSxCOLS input and its per-row reduction golden."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        golden = tile.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        golden = tile.max(axis=1)
    elif op == "rowmin":
        golden = tile.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Write input(s) and the per-column reduction golden.

    colsum carries an extra scratch buffer and may accumulate into a
    pre-seeded output; colmax/colmin take a single input.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        seed = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        seed = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = seed
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = tile.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + seed
    elif op == "colmax":
        golden = tile.max(axis=0)
    elif op == "colmin":
        golden = tile.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Golden broadcasts column 0 of the input across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(tile[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden broadcasts row 0 of the input down all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(tile[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden combines src0 with one scalar per row from src1's first ROWS elements."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    tile0 = _as_matrix(src0)
    tile1 = _as_matrix(src1)
    per_row = tile1.reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        golden = tile0 * per_row
    elif op == "rowexpanddiv":
        golden = tile0 / per_row
    elif op == "rowexpandsub":
        golden = tile0 - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All inputs stay zero; the golden is the output filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    _write_golden(meta, {out_name: np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Write compare inputs and the packed predicate-mask golden.

    "cmp" is element-wise src0 < src1; "cmps" is src0 > scalar.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    count = meta.elem_counts[out_name]
    if count % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={count}")
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=count // ROWS)})


def generate_sel_case():
    """Golden selects src0 where the packed mask bit is 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Mask bits drawn before the data so the RNG stream matches the original.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_sels_case(select_mode: int):
    """Write inputs/golden for SELS: the golden is the first input when
    select_mode == 1, otherwise the second (whole-tensor select)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    first, second = meta.inputs
    # Draw in declaration order so the RNG stream matches other generators.
    values = {
        name: _float_values(rng, meta.elem_counts[name], style="signed")
        for name in (first, second)
    }
    buffers = _default_buffers(meta)
    buffers.update(values)
    _write_buffers(meta, buffers)
    chosen = values[first] if int(select_mode) == 1 else values[second]
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Write inputs/golden for integer ops with an immediate scalar operand.

    Shift ops draw small shift amounts; other ops draw general bitwise
    values.  The scalar is first narrowed to `dtype` like the device would.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    scalar = np.asarray(scalar, dtype=dtype).item()
    if op == "ands":
        out = np.bitwise_and(src, scalar)
    elif op == "ors":
        out = np.bitwise_or(src, scalar)
    elif op == "xors":
        out = np.bitwise_xor(src, scalar)
    elif op == "shls":
        out = np.left_shift(src, scalar)
    elif op == "shrs":
        out = np.right_shift(src, scalar)
    else:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a golden binary file against a produced output file.

    Both files are read as flat arrays of `dtype` and compared with
    np.allclose(atol=eps, rtol=eps, NaNs equal).  On mismatch the worst
    element is reported.  Returns True on success, False otherwise.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before subtracting so integer diffs cannot wrap around.
        # np.integer already covers both signed and unsigned kinds, so the
        # previous extra np.unsignedinteger test was redundant; everything
        # else (floats, bool) widens to float64.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare two packed predicate-mask files byte-wise, ignoring the
    per-row padding beyond the packed bit words.  Returns True/False."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # Only the leading packed-word bytes of each row carry information.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel != output_sel)[0]
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True
dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_unary_float_case("abs") diff --git a/test/samples/Addc/addc_compare.py b/test/samples/Addc/addc_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Addc/addc_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def 
load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: 
str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for 
cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + 
rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, 
np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): 
def generate_addc_case(op: str):
    """Write inputs/golden for three-operand carry ops:
    addc -> a + b + c, subc -> a - b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    names = meta.inputs
    # Draw in declaration order; the carry operand uses a smaller range.
    styles = ("signed", "signed", "signed_small")
    operands = [_float_values(rng, meta.elem_counts[n], style=s) for n, s in zip(names, styles)]
    a, b, c = operands
    buffers = _default_buffers(meta)
    buffers.update(zip(names, operands))
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Write input/golden for scalar carry ops.

    addsc -> src + scalar + src; subsc -> src - scalar + src.  The doubled
    src term presumably mirrors the kernel's accumulate semantics — confirm
    against the device implementation.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = 
def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Write inputs/golden for compare ops producing a packed predicate mask.

    "cmp" compares two tensors element-wise (src0 < src1); "cmps" compares
    one tensor against `scalar` (src0 > scalar).  The golden mask is packed
    with the device bit layout.  Raises ValueError for an unknown op or an
    output count that is not a multiple of ROWS.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    packed = pack_predicate_mask(pred, storage_cols=storage_cols)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Write inputs/golden for SEL: out = src0 where the mask bit is set,
    else src1.

    The mask is random bits packed with the device layout; the golden is
    computed from the unpacked bits.

    Fix: validate that the mask element count is a multiple of ROWS before
    deriving the storage stride (consistent with generate_cmp_case) instead
    of letting integer division silently truncate on malformed metadata.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    if meta.elem_counts[mask_name] % ROWS != 0:
        raise ValueError(f"{mask_name}: cannot derive mask storage stride from count={meta.elem_counts[mask_name]}")
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Write inputs/golden for integer ops applied with the source as both
    operands (e.g. and(src, src), shl(src, src)); "not" is unary.

    Shift ops draw small shift amounts so results stay in range.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    binary_ops = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op in binary_ops:
        out = binary_ops[op](src, src)
    elif op == "not":
        out = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})
def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare two packed predicate-mask files.

    Only the leading packed-word bytes of each row are compared; per-row
    padding beyond them is ignored.  Returns True on match, False otherwise
    (with a diagnostic print).
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden_raw = np.fromfile(golden_path, dtype=np.uint8)
    output_raw = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if min(golden_raw.size, output_raw.size) < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden_raw.size}, out={output_raw.size}"
        )
        return False
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden_raw[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output_raw[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden file, then finalize."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)
a/test/samples/Addc/addc_golden.py b/test/samples/Addc/addc_golden.py new file mode 100755 index 00000000..a9dc8190 --- /dev/null +++ b/test/samples/Addc/addc_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if 
line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) 
-> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for 
name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = 
"signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise 
ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = 
_default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + 
src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name 
is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 
1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + 
dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, 
:row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_addc_case("addc") diff --git a/test/samples/Adds/adds_compare.py b/test/samples/Adds/adds_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Adds/adds_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + 
"uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, 
size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + 
out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + 
expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in 
{"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + 
def generate_prelu_case():
    """Write inputs and golden for PReLU: keep positives, scale negatives."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    value_name, slope_name = meta.inputs
    values = _float_values(rng, meta.elem_counts[value_name], style="signed")
    slopes = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[value_name] = values
    buffers[slope_name] = slopes
    _write_buffers(meta, buffers)
    golden = np.where(values > 0.0, values, values * slopes)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Write inputs and golden for the three-operand addc/subc cases."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    term_a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    term_b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    # Third operand stays small so results remain comfortably in float32 range.
    term_c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = term_a
    buffers[b_name] = term_b
    buffers[c_name] = term_c
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = term_a + term_b + term_c
    elif op == "subc":
        golden = term_a - term_b + term_c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Write inputs and golden for addsc/subsc."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # NOTE(review): golden is (src op scalar) + src, i.e. the source is folded
    # in twice — presumably matching the device op's accumulate semantics.
    if op == "addsc":
        golden = src + np.float32(scalar) + src
    elif op == "subsc":
        golden = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Write inputs and golden for per-row reductions (rowsum/rowmax/rowmin)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        golden = matrix.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        golden = matrix.max(axis=1)
    elif op == "rowmin":
        golden = matrix.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Write inputs and golden for per-column reductions.

    colsum cases declare an extra scratch buffer in main.cpp; when
    `accumulate` is set the output is pre-seeded and folded into the golden.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        # Device-side scratch buffer starts zeroed.
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = matrix.max(axis=0)
    elif op == "colmin":
        golden = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Golden: broadcast column 0 of the source across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(matrix[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden: broadcast row 0 of the source across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(matrix[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden for rowexpand{mul,div,sub}: combine src0 with per-row scalars
    taken from the leading ROWS elements of src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    matrix = _as_matrix(src0)
    row_scalars = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        golden = matrix * row_scalars[:, None]
    elif op == "rowexpanddiv":
        golden = matrix / row_scalars[:, None]
    elif op == "rowexpandsub":
        golden = matrix - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for expands: the whole output is filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    filled = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: filled})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Golden for cmp/cmps: a packed predicate mask (cmp: a < b; cmps: a > scalar)."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    # The per-row mask stride is derived from the declared element count.
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Golden for sel: elementwise select src0 where mask bit set, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_sels_case(select_mode: int):
    """Golden for sels: whole-tensor select (mode 1 -> src0, else src1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Golden for integer bitwise/shift ops applied to the source and itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift amounts must stay tiny; plain bitwise ops use a wider range.
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "and":
        result = np.bitwise_and(src, src)
    elif op == "or":
        result = np.bitwise_or(src, src)
    elif op == "xor":
        result = np.bitwise_xor(src, src)
    elif op == "shl":
        result = np.left_shift(src, src)
    elif op == "shr":
        result = np.right_shift(src, src)
    elif op == "not":
        result = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Golden for integer bitwise/shift ops against an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Wrap the immediate into the target dtype first (e.g. -1 -> 0xFF for uint8).
    scalar = np.asarray(scalar, dtype=dtype).item()
    if op == "ands":
        result = np.bitwise_and(src, scalar)
    elif op == "ors":
        result = np.bitwise_or(src, scalar)
    elif op == "xors":
        result = np.bitwise_xor(src, scalar)
    elif op == "shls":
        result = np.left_shift(src, scalar)
    elif op == "shrs":
        result = np.right_shift(src, scalar)
    else:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binaries as `dtype` within eps; print details on failure."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    # Mismatch path: report the worst element, widened so the subtraction
    # cannot overflow narrow integer dtypes.
    if golden.size:
        if np.issubdtype(dtype_np, np.floating):
            golden_cmp = golden.astype(np.float64, copy=False)
            output_cmp = output.astype(np.float64, copy=False)
        elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
            golden_cmp = golden.astype(np.int64, copy=False)
            output_cmp = output.astype(np.int64, copy=False)
        else:
            golden_cmp = golden.astype(np.float64, copy=False)
            output_cmp = output.astype(np.float64, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate masks, checking only the meaningful row bytes."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # Bytes beyond the packed 64-bit words are padding and deliberately ignored.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    diff = np.nonzero(golden_sel != output_sel)[0]
    idx = int(diff[0]) if diff.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden file; gate via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)
# --- Patch boundary preserved from the original diff: ---
# diff --git a/test/samples/Adds/adds_golden.py b/test/samples/Adds/adds_golden.py
# new file mode 100755
# index 00000000..1937f3b2
# --- /dev/null
# +++ b/test/samples/Adds/adds_golden.py
# @@ -0,0 +1,737 @@
#!/usr/bin/python3
# coding=utf-8

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


SEED = 19  # fixed seed so inputs and golden are reproducible across runs
ROWS = 32  # logical matrix shape used by every sample case
COLS = 32

# Host-side C++ type name -> numpy dtype used for .bin (de)serialization.
# bfloat16 has no native numpy dtype, so it is carried as raw uint16 words.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    # Buffer name -> element count parsed from main.cpp.
    elem_counts: Dict[str, int]
    # Buffer name -> numpy dtype parsed from main.cpp.
    np_types: Dict[str, np.dtype]
    # Buffer names in the order main.cpp ReadFile()s them.
    read_order: List[str]
    # Names listed in outputs.txt (empty when the file is absent).
    outputs: List[str]

    @property
    def inputs(self) -> List[str]:
        """Buffers the kernel reads that are not also outputs."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host C++ type name to its numpy dtype; KeyError when unknown."""
    host_type = host_type.strip()
    if host_type not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {host_type}")
    return np.dtype(_HOST_TYPE_TO_NP[host_type])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Recover buffer names, sizes, dtypes and the output set by parsing main.cpp."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    elem_counts = {
        match.group(1): int(match.group(2))
        for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text)
    }
    np_types = {
        match.group(1): _host_type_to_np(match.group(2))
        for match in re.finditer(
            r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
            text,
        )
    }
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    if Path(outputs_txt).is_file():
        outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()]
    else:
        outputs = []
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)


def _rng():
    """Fresh deterministic generator; every case draws from the same seed."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as the (rows, cols) case matrix, validating size."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 values; `style` selects the sampling range."""
    ranges = {
        "signed": (-3.0, 3.0),
        "signed_small": (-1.5, 1.5),
        "nonzero_signed": (-3.0, 3.0),
        "positive": (0.25, 4.0),
        "exp": (-2.0, 2.0),
        "cmp": (-2.0, 2.0),
    }
    if style not in ranges:
        raise ValueError(f"unsupported float style: {style}")
    low, high = ranges[style]
    arr = rng.uniform(low, high, size=count).astype(np.float32)
    if style == "nonzero_signed":
        # Push near-zero draws out to +/-0.25 so division goldens stay finite.
        small = np.abs(arr) < np.float32(0.25)
        arr[small] = np.where(arr[small] >= 0.0, np.float32(0.25), np.float32(-0.25))
    return arr


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw `count` integers for bitwise/shift cases in the given dtype."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style == "bitwise":
            drawn = rng.integers(-256, 256, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            drawn = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            # Shift amounts stay in [0, 4) to avoid undefined wide shifts.
            drawn = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return drawn.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes of packed predicate data per row: one 8-byte word per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a (rows, cols) boolean matrix into the device predicate layout.

    Each row is packed LSB-first into little-endian 64-bit words and padded
    out to `storage_cols` bytes per row; the result is returned flattened.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_no, base_col in enumerate(range(0, cols, 64)):
            width = min(64, cols - base_col)
            word = 0
            for bit_no in range(width):
                if bits[row, base_col + bit_no]:
                    word |= 1 << bit_no
            packed[row, word_no * 8:(word_no + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return packed.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover the (rows, cols) boolean matrix."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_no, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_no * 8:(word_no + 1) * 8]), "little")
            width = min(64, cols - base_col)
            for bit_no in range(width):
                bits[row, base_col + bit_no] = ((word >> bit_no) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """A zero-filled buffer of the declared size and dtype for `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffers for every name main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write every buffer (in ReadFile order) to <name>.bin, validating sizes."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write one golden_<name>.bin per declared output, validating sizes."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the sole output name; every case here produces exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate input and golden binaries for a two-operand float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # div/rem keep the denominator away from zero so goldens stay finite.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        golden = lhs + rhs
    elif op == "sub":
        golden = lhs - rhs
    elif op == "mul":
        golden = lhs * rhs
    elif op == "div":
        golden = lhs / rhs
    elif op == "max":
        golden = np.maximum(lhs, rhs)
    elif op == "min":
        golden = np.minimum(lhs, rhs)
    elif op == "rem":
        golden = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


# Tail of the preceding sample script (relocated below the shared constants
# so ROWS/COLS exist when the keyword defaults are evaluated).
def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every output as a packed predicate mask; gate via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Exit non-zero on failure unless COMPARE_STRICT=0 downgrades it to a warning."""
    strict = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if strict:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True


if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-4)
rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src 
* np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = 
load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: 
expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise 
def generate_sels_case(select_mode: int):
    """Golden for sels: pass through src0 when select_mode == 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    chosen = a if int(select_mode) == 1 else b
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Golden for a bitwise op applied with the source as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift amounts use a tiny range; plain bitwise ops use signed byte-ish values.
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    two_operand = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(src)
    elif op in two_operand:
        result = two_operand[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Golden for a bitwise op between a tensor and an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Wrap the immediate into the tensor dtype before applying it.
    imm = np.asarray(scalar, dtype=dtype).item()
    scalar_ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = scalar_ops[op](src, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two .bin files elementwise with tolerance `eps`.

    Returns True on match; on mismatch prints the worst-diff element and
    returns False. Missing files and shape mismatches also return False.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before subtracting so the diff itself cannot overflow.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate-mask files, ignoring pad bytes past each row's payload."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if min(golden.size, output.size) < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # Only the first payload bytes of each row carry mask bits; the rest is padding.
    payload = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :payload].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :payload].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    bad = np.nonzero(golden_sel != output_sel)[0]
    idx = int(bad[0]) if bad.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={payload})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden .bin; gate the verdict."""
    meta = load_case_meta()
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask; gate the verdict."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Print the overall verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    generate_scalar_float_case("adds", 3.14)
100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Addsc/addsc_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, 
np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: 
np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: 
Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: 
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input and golden for a scalar/unary float op.

    scalar_left only affects "divs": compute scalar / src instead of src / scalar.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Pick a distribution the op is defined on (e.g. positive for log/sqrt).
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    elif op == "cmps":
        style = "cmp"
    draw_style = "nonzero_signed" if op == "divs2" else style
    src = _float_values(rng, meta.elem_counts[src_name], style=draw_style)
    # NOTE(review): the re-draws below discard the first sample and advance the
    # RNG stream; kept as-is so the generated .bin bytes stay identical.
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    imm = np.float32(scalar)
    if op == "adds":
        out = src + imm
    elif op == "subs":
        out = src - imm
    elif op == "muls":
        out = src * imm
    elif op == "divs":
        out = imm / src if scalar_left else src / imm
    elif op == "maxs":
        out = np.maximum(src, imm)
    elif op == "mins":
        out = np.minimum(src, imm)
    elif op == "rems":
        out = np.fmod(src, imm)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * imm)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_unary_float_case(op: str):
    """Generate input and golden for a pure unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Pick a distribution the op is defined on.
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    simple = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
    }
    if op in simple:
        out = simple[op](src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Golden for prelu: src where positive, else src * per-element slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    result = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_addc_case(op: str):
    """Golden for three-operand addc/subc: a +/- b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})
def generate_scalar_carry_case(op: str, scalar: float):
    """Golden for addsc/subsc: (src +/- scalar) + src — the carry re-adds src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    imm = np.float32(scalar)
    if op == "addsc":
        out = (src + imm) + src
    elif op == "subsc":
        out = (src - imm) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Golden for a per-row reduction (sum/max/min over columns)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = mat.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = mat.max(axis=1)
    elif op == "rowmin":
        out = mat.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Golden for a column reduction (sum/max/min across rows).

    colsum cases declare an extra scratch input; with accumulate=True the
    output buffer is pre-seeded and added into the column sums.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        # Kernel scratch space starts out all-zero.
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = mat.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = mat.max(axis=0)
    elif op == "colmin":
        out = mat.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Golden: broadcast column 0 of the source across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(mat[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden: broadcast row 0 of the source across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(mat[:1, :], (ROWS, 1))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})
def generate_rowexpand_bin_case(op: str):
    """Generate inputs and golden for a row-broadcast binary op (mul/div/sub).

    The second input supplies one scalar per row (its first ROWS values);
    that scalar is broadcast across the row of the first input.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, per_row_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors must stay away from zero.
    per_row_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    per_row = _float_values(rng, meta.elem_counts[per_row_name], style=per_row_style)
    lhs_m = _as_matrix(lhs)
    # Only the first ROWS values of the second operand act as per-row scalars.
    scalars = _as_matrix(per_row).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[per_row_name] = per_row
    _write_buffers(meta, buffers)
    column = scalars[:, None]
    if op == "rowexpandmul":
        result = lhs_m * column
    elif op == "rowexpanddiv":
        result = lhs_m / column
    elif op == "rowexpandsub":
        result = lhs_m - column
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for a fill op: output is `scalar` everywhere; inputs stay zeroed."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    dst = _single_output(meta)
    golden = np.full(meta.elem_counts[dst], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {dst: golden})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate a packed predicate-mask golden for cmp (a < b) or cmps (a > scalar)."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        lhs_name, rhs_name = meta.inputs
        lhs = _float_values(rng, meta.elem_counts[lhs_name], style="cmp")
        rhs = _float_values(rng, meta.elem_counts[rhs_name], style="cmp")
        pred = _as_matrix(lhs) < _as_matrix(rhs)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        lhs_name = meta.inputs[0]
        lhs = _float_values(rng, meta.elem_counts[lhs_name], style="cmp")
        rhs_name = None
        rhs = None
        pred = _as_matrix(lhs) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    if rhs is not None and rhs_name is not None:
        buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    mask_name = _single_output(meta)
    count = meta.elem_counts[mask_name]
    # The mask buffer must split evenly into per-row byte strides.
    if count % ROWS != 0:
        raise ValueError(f"{mask_name}: cannot derive mask storage stride from count={count}")
    packed = pack_predicate_mask(pred, storage_cols=count // ROWS)
    _write_golden(meta, {mask_name: packed})
def generate_sel_case():
    """Generate mask/src0/src1 inputs and an np.where golden for sel."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    stride = meta.elem_counts[mask_name] // ROWS
    bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(bits, storage_cols=stride)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    chosen = np.where(bits, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for sels: pass through src0 when select_mode == 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    chosen = a if int(select_mode) == 1 else b
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Golden for a bitwise op applied with the source as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift amounts use a tiny range; plain bitwise ops use signed byte-ish values.
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    two_operand = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(src)
    elif op in two_operand:
        result = two_operand[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Golden for a bitwise op between a tensor and an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Wrap the immediate into the tensor dtype before applying it.
    imm = np.asarray(scalar, dtype=dtype).item()
    scalar_ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = scalar_ops[op](src, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two .bin files elementwise with tolerance `eps`.

    Returns True on match; on mismatch prints the worst-diff element and
    returns False. Missing files and shape mismatches also return False.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before subtracting so the diff itself cannot overflow.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate-mask files, ignoring pad bytes past each row's payload."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if min(golden.size, output.size) < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # Only the first payload bytes of each row carry mask bits; the rest is padding.
    payload = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :payload].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :payload].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    bad = np.nonzero(golden_sel != output_sel)[0]
    idx = int(bad[0]) if bad.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={payload})"
    )
    return False
def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden .bin; gate the verdict."""
    meta = load_case_meta()
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask; gate the verdict."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Print the overall verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-4)
"uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, 
size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), 
dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: 
expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], 
style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def 
generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = 
pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, 
dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) 
+ output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = 
load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_scalar_carry_case("addsc", 3.14) diff --git a/test/samples/And/and_compare.py b/test/samples/And/and_compare.py new file mode 100755 index 00000000..780b65b1 --- /dev/null +++ b/test/samples/And/and_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = 
Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == 
"bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) 
+ for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, 
meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + 
elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if 
def generate_scalar_carry_case(op: str, scalar: float):
    """Generate input/golden data for scalar carry ops: (src op scalar) + src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        golden = src + np.float32(scalar) + src
    elif op == "subsc":
        golden = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate input/golden data for a per-row reduction (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    golden = reducers[op](src_m)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate input/golden data for a per-column reduction.

    ``colsum`` cases declare an extra scratch input buffer; with
    ``accumulate`` the output buffer is pre-seeded and added to the sum.
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        # Pre-seed the output so the kernel's read-modify-write path is exercised.
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = src_m.max(axis=0)
    elif op == "colmin":
        golden = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast each row's first element across the whole row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.tile(src_m[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast the first row across all rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.tile(src_m[:1, :], (ROWS, 1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_rowexpand_bin_case(op: str):
    """Apply a binary op between a matrix and per-row scalars from a second input."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # Per-row divisors must stay away from zero.
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    src0_m = _as_matrix(src0)
    # Only the first ROWS values of src1 act as the per-row scalar column.
    row_scalars = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    column = row_scalars[:, None]
    if op == "rowexpandmul":
        golden = src0_m * column
    elif op == "rowexpanddiv":
        golden = src0_m / column
    elif op == "rowexpandsub":
        golden = src0_m - column
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Fill the single output with a scalar constant; all inputs stay zeroed."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate packed-predicate golden data for vector/scalar compares."""
    meta = load_case_meta()
    rng = _rng()
    buffers = _default_buffers(meta)
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
        buffers[src0_name] = src0
        buffers[src1_name] = src1
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
        buffers[src0_name] = src0
    else:
        raise ValueError(f"unsupported compare op: {op}")
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})
def generate_sel_case():
    """Generate mask/select inputs and golden for sel (mask ? src0 : src1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Draw order matters for reproducibility: mask bits, then src0, then src1.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Generate inputs/golden for sels: mode 1 selects src0, anything else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate integer input/golden for bitwise ops applied to the input and itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(_rng(), meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    transforms = {
        "and": lambda v: np.bitwise_and(v, v),
        "or": lambda v: np.bitwise_or(v, v),
        "xor": lambda v: np.bitwise_xor(v, v),
        "shl": lambda v: np.left_shift(v, v),
        "shr": lambda v: np.right_shift(v, v),
        "not": np.bitwise_not,
    }
    if op not in transforms:
        raise ValueError(f"unsupported bitwise op: {op}")
    golden = transforms[op](src)
    _write_golden(meta, {_single_output(meta): np.asarray(golden, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate integer input/golden for bitwise ops against an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(_rng(), meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Wrap the immediate into the target dtype so overflow matches the kernel.
    imm = np.asarray(scalar, dtype=dtype).item()
    transforms = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in transforms:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    golden = transforms[op](src, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(golden, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a produced .bin file against its golden file; return True on match.

    Parameters:
        golden_path / output_path: paths to the raw little-endian buffers.
        dtype: element dtype used to interpret both files.
        eps: absolute and relative tolerance for np.allclose (floats).

    Fix over the original: integer buffers with eps == 0 are now compared
    exactly via np.array_equal.  np.allclose casts its inputs to float64,
    which silently loses precision for 64-bit integer values above 2**53.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    is_int = np.issubdtype(dtype_np, np.integer)
    if is_int and eps == 0:
        matched = np.array_equal(golden, output)  # exact, no float round-trip
    else:
        matched = np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True)
    if matched:
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Report the worst-offending element to ease kernel debugging.
    cmp_dtype = np.int64 if is_int else np.float64
    golden_cmp = golden.astype(cmp_dtype, copy=False)
    output_cmp = output.astype(cmp_dtype, copy=False)
    abs_diff = np.abs(golden_cmp - output_cmp)
    idx = int(np.argmax(abs_diff))
    diff = float(abs_diff[idx])
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
        f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate-mask buffers, ignoring per-row padding bytes."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # Only the leading packed-word bytes of each row carry mask bits.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False
def compare_all_outputs(dtype, eps):
    """Compare every declared output buffer against its golden file."""
    meta = load_case_meta()
    results = [
        compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Report the overall result; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    compare_all_outputs(np.int16, 0.0)
b/test/samples/And/and_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + 
def _rng():
    """Return a fresh deterministic generator seeded with the module SEED."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as a rows x cols matrix, validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    if flat.size != rows * cols:
        raise ValueError(f"expected {rows * cols} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 values from a named distribution style."""
    if style == "signed":
        return rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    if style == "signed_small":
        return rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    if style == "nonzero_signed":
        vals = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        # Push near-zero values away from zero so divisions stay stable.
        near_zero = np.abs(vals) < np.float32(0.25)
        vals[near_zero] = np.where(vals[near_zero] >= 0.0, np.float32(0.25), np.float32(-0.25))
        return vals
    if style == "positive":
        return rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    if style in {"exp", "cmp"}:
        return rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    raise ValueError(f"unsupported float style: {style}")


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw `count` integers for the supported dtype/style combinations."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style != "bitwise":
            raise ValueError(f"unsupported int16 style: {style}")
        vals = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes needed per mask row: one little-endian u64 word per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into per-row little-endian 64-bit words.

    Each row occupies `storage_cols` bytes; bytes beyond the packed words
    stay zero as padding.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            width = min(64, cols - base_col)
            word = sum(1 << b for b in range(width) if bits[row, base_col + b])
            out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(
                word.to_bytes(8, "little"), dtype=np.uint8
            )
    return out.reshape(-1)
def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: expand packed words to a bool matrix."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            for bit_idx in range(min(64, cols - base_col)):
                bits[row, base_col + bit_idx] = bool((word >> bit_idx) & 1)
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """Zero-filled buffer matching `name`'s declared count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """One zero buffer for every file the case's main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write every declared input buffer to <name>.bin, validating size/dtype."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        if arr.size != meta.elem_counts[name]:
            raise ValueError(f"{name}: expected {meta.elem_counts[name]} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")
def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write golden_<name>.bin for every declared output, validating size/dtype."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        if arr.size != meta.elem_counts[name]:
            raise ValueError(f"{name}: expected {meta.elem_counts[name]} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's only output name; error if there isn't exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs/golden for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors must stay away from zero.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    combine = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in combine:
        raise ValueError(f"unsupported binary float op: {op}")
    golden = combine[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input buffers and the golden output for a scalar/unary float op.

    Parameters:
        op: operation name (adds/subs/muls/divs/maxs/mins/rems/lrelu/exp/log/
            sqrt/rsqrt/recip; "divs2" only selects a nonzero input style and
            then fails in the compute stage, matching the original behavior).
        scalar: immediate operand for the scalar ops.
        scalar_left: for "divs", compute scalar / src instead of src / scalar.

    Fix over the original: the input style was resolved through a chain of
    overlapping `if`s and the random data was drawn (and discarded) up to
    twice for divs/rems/log/sqrt/rsqrt/recip.  The style is now resolved
    once and the data drawn once; generated values stay self-consistent
    with the golden since both come from the same draw.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Resolve the value distribution once, based on the op's domain.
    if op == "exp":
        style = "exp"
    elif op == "cmps":
        style = "cmp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    elif op == "divs2":
        style = "nonzero_signed"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_unary_float_case(op: str):
    """Generate input/golden data for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    # Ops with a restricted domain draw values from a safe range.
    if op == "exp":
        value_style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        value_style = "positive"
    else:
        value_style = "signed"
    src = _float_values(_rng(), meta.elem_counts[src_name], style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    transforms = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda v: 1.0 / np.sqrt(v),
        "recip": lambda v: 1.0 / v,
        "relu": lambda v: np.maximum(v, np.float32(0.0)),
    }
    if op not in transforms:
        raise ValueError(f"unsupported unary float op: {op}")
    out = transforms[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_prelu_case():
    """Generate input/golden data for PReLU (per-element slope on negatives)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate input/golden data for three-operand add/sub-with-carry ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name, carry_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style="signed")
    carry = _float_values(rng, meta.elem_counts[carry_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    buffers[carry_name] = carry
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = lhs + rhs + carry
    elif op == "subc":
        golden = lhs - rhs + carry
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_scalar_carry_case(op: str, scalar: float):
    """Generate input/golden data for scalar carry ops: (src op scalar) + src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        golden = src + np.float32(scalar) + src
    elif op == "subsc":
        golden = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate input/golden data for a per-row reduction (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    golden = reducers[op](src_m)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate input/golden data for a per-column reduction.

    ``colsum`` cases declare an extra scratch input buffer; with
    ``accumulate`` the output buffer is pre-seeded and added to the sum.
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        # Pre-seed the output so the kernel's read-modify-write path is exercised.
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = src_m.max(axis=0)
    elif op == "colmin":
        golden = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})
def generate_rowexpand_case():
    """Broadcast each row's first element across the whole row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.tile(src_m[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast the first row across all rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.tile(src_m[:1, :], (ROWS, 1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Apply a binary op between a matrix and per-row scalars from a second input."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # Per-row divisors must stay away from zero.
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    src0_m = _as_matrix(src0)
    # Only the first ROWS values of src1 act as the per-row scalar column.
    row_scalars = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    column = row_scalars[:, None]
    if op == "rowexpandmul":
        golden = src0_m * column
    elif op == "rowexpanddiv":
        golden = src0_m / column
    elif op == "rowexpandsub":
        golden = src0_m - column
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_expands_case(scalar: float):
    """Fill the single output with a scalar constant; all inputs stay zeroed."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate packed-predicate golden data for vector/scalar compares."""
    meta = load_case_meta()
    rng = _rng()
    buffers = _default_buffers(meta)
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
        buffers[src0_name] = src0
        buffers[src1_name] = src1
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
        buffers[src0_name] = src0
    else:
        raise ValueError(f"unsupported compare op: {op}")
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})
def generate_sel_case():
    """Generate mask/select inputs and golden for sel (mask ? src0 : src1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Draw order matters for reproducibility: mask bits, then src0, then src1.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Generate inputs/golden for sels: mode 1 selects src0, anything else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate integer input/golden for bitwise ops applied to the input and itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(_rng(), meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    transforms = {
        "and": lambda v: np.bitwise_and(v, v),
        "or": lambda v: np.bitwise_or(v, v),
        "xor": lambda v: np.bitwise_xor(v, v),
        "shl": lambda v: np.left_shift(v, v),
        "shr": lambda v: np.right_shift(v, v),
        "not": np.bitwise_not,
    }
    if op not in transforms:
        raise ValueError(f"unsupported bitwise op: {op}")
    golden = transforms[op](src)
    _write_golden(meta, {_single_output(meta): np.asarray(golden, dtype=dtype)})
style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, 
dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, 
output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_self_case("and", np.int16) diff --git a/test/samples/Ands/ands_compare.py b/test/samples/Ands/ands_compare.py new file mode 100755 index 00000000..780b65b1 --- /dev/null +++ b/test/samples/Ands/ands_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + 
"unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, 
size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), 
dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: 
expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], 
style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def 
generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = 
pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, 
dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) 
+ output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = 
load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.int16, 0.0) diff --git a/test/samples/Ands/ands_golden.py b/test/samples/Ands/ands_golden.py new file mode 100755 index 00000000..4905e5b2 --- /dev/null +++ b/test/samples/Ands/ands_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") 
+ elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, 
size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, 
base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers 
= _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == 
"log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 
inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name 
= meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() 
+ src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = 
np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not 
os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_scalar_case("ands", 88, np.int16) diff --git a/test/samples/Cmp/cmp_compare.py b/test/samples/Cmp/cmp_compare.py new file mode 100755 index 00000000..00382a69 --- 
/dev/null +++ b/test/samples/Cmp/cmp_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, 
outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> 
np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in 
meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): 
+ meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src 
= _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], 
dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if 
meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + 
style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, 
dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, 
output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_packed_mask_outputs() diff --git a/test/samples/Cmp/cmp_golden.py b/test/samples/Cmp/cmp_golden.py new file mode 100755 index 00000000..44fe0111 --- /dev/null +++ b/test/samples/Cmp/cmp_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": 
np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask 
= np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def 
unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got 
{arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = 
_default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + 
if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = 
load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = 
load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + 
+ +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, 
storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def 
generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = 
output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True 
+ for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_cmp_case("cmp") diff --git a/test/samples/Cmps/cmps_compare.py b/test/samples/Cmps/cmps_compare.py new file mode 100755 index 00000000..00382a69 --- /dev/null +++ b/test/samples/Cmps/cmps_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): 
int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + 
raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 
64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + 
buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + 
elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + 
rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = 
False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, 
meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = 
_as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = 
_float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = 
np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + 
return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_packed_mask_outputs() diff --git a/test/samples/Cmps/cmps_golden.py b/test/samples/Cmps/cmps_golden.py new file mode 100755 index 00000000..208c4d8c --- /dev/null +++ b/test/samples/Cmps/cmps_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# 
coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def 
_as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise 
ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for 
{name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = 
_default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() 
+ if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_cmp_case("cmps", scalar=1.0) diff --git a/test/samples/Colexpand/colexpand_compare.py b/test/samples/Colexpand/colexpand_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Colexpand/colexpand_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + 
elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = 
np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: 
int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def 
_single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, 
buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, 
got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate data for a column reduction (colsum/colmax/colmin).

    accumulate: when True, the output buffer is pre-seeded with small values
    that colsum adds into (the other reductions ignore the seed).
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        # colsum kernels take an extra scratch buffer as a second non-output input.
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        seed_out = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        seed_out = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = seed_out
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + seed_out
    elif op == "colmax":
        golden = matrix.max(axis=0)
    elif op == "colmin":
        golden = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})
def generate_rowexpand_bin_case(op: str):
    """Generate data for ops combining a matrix with one scalar per row."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # Divisor values are bounded away from zero for the division variant.
    second_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=second_style)
    lhs = _as_matrix(src0)
    # Only the first ROWS elements of src1 act as the per-row scalar column.
    per_row = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        golden = lhs * per_row
    elif op == "rowexpanddiv":
        golden = lhs / per_row
    elif op == "rowexpandsub":
        golden = lhs - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed predicate-mask golden for cmp/cmps cases."""
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)  # element-wise less-than
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)  # greater-than immediate
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    # The mask output's element count fixes its per-row storage stride.
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})
def generate_sels_case(select_mode: int):
    """Golden for scalar-select: whole first input when mode == 1, else second."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    first_name, second_name = meta.inputs
    first = _float_values(rng, meta.elem_counts[first_name], style="signed")
    second = _float_values(rng, meta.elem_counts[second_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[first_name] = first
    buffers[second_name] = second
    _write_buffers(meta, buffers)
    chosen = first if int(select_mode) == 1 else second
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare one golden .bin file against one device-output .bin file.

    Returns True when both files exist, have equal element counts, and are
    element-wise close within eps (atol and rtol; NaN matches NaN).
    Prints an [ERROR] diagnostic and returns False otherwise.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Promote to a wide type so the reported difference cannot overflow.
        if np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
            wide = np.int64
        else:
            wide = np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
def finalize_compare(ok: bool):
    """Report compare status and gate the process exit code.

    On failure: exits with status 2 unless the COMPARE_STRICT environment
    variable is set to "0", in which case it only warns and returns False.
    On success: prints a pass message and returns True.
    """
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every buffer named in meta.read_order to "<name>.bin".

    Each array is coerced to the declared numpy dtype and flattened; an
    element-count mismatch against the parsed main.cpp metadata is an error.
    """
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        flat = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        wanted = meta.elem_counts[name]
        if flat.size != wanted:
            raise ValueError(f"{name}: expected {wanted} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input and golden data for a scalar-operand (or unary) float op.

    op: one of adds/subs/muls/divs/maxs/mins/rems/lrelu/exp/log/sqrt/rsqrt/recip
        ("divs2" only selects a nonzero sampling style — it has no compute
        branch here and raises; presumably callers map it to op="divs" —
        TODO confirm against the Divs2 sample).
    scalar: immediate operand applied elementwise.
    scalar_left: for "divs", compute scalar / src instead of src / scalar.

    Fix: the previous version re-drew the source buffer a second (and third)
    time for divs/rems and for log/sqrt/rsqrt/recip, discarding samples from
    the rng stream. The sampling style per op is unchanged; it is now resolved
    once and the buffer drawn exactly once.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"        # keep the op's domain valid
    elif op == "exp":
        style = "exp"             # bounded range avoids overflow
    elif op == "cmps":
        style = "cmp"
    elif op == "divs2":
        style = "nonzero_signed"  # src acts as divisor; keep it away from zero
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    if op == "adds":
        out = src + s
    elif op == "subs":
        out = src - s
    elif op == "muls":
        out = src * s
    elif op == "divs":
        # NOTE(review): with scalar_left=True the "signed" samples may be near
        # zero, producing very large quotients — confirm compare tolerance.
        out = s / src if scalar_left else src / s
    elif op == "maxs":
        out = np.maximum(src, s)
    elif op == "mins":
        out = np.minimum(src, s)
    elif op == "rems":
        out = np.fmod(src, s)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Generate data for a row reduction (rowsum/rowmax/rowmin)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        # rowsum accumulates in float32 to match the device accumulator width.
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    golden = reducers[op](matrix)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_rowexpand_case():
    """Golden for row-expand: broadcast column 0 across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(matrix[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})
def generate_expands_case(scalar: float):
    """Golden for expand-scalar: output filled with `scalar`; inputs stay zero."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    filled = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: filled})
def generate_sel_case():
    """Generate a packed mask plus two sources; golden is np.where(mask, a, b)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Draw raw 0/1 bits first, then pack into the device mask layout.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    buffers = _default_buffers(meta)
    buffers[mask_name] = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    sources = {}
    for name in (src0_name, src1_name):
        sources[name] = _float_values(rng, meta.elem_counts[name], style="signed")
        buffers[name] = sources[name]
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(sources[src0_name]), _as_matrix(sources[src1_name]))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate data for a bitwise op whose both operands are the same buffer."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift counts are kept small; other ops use the general bitwise range.
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    binary_ops = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        out = np.bitwise_not(src)
    elif op in binary_ops:
        out = binary_ops[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_colexpand_case() diff --git a/test/samples/Colmax/colmax_compare.py b/test/samples/Colmax/colmax_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Colmax/colmax_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
+import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + 
rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Colmax/colmax_golden.py b/test/samples/Colmax/colmax_golden.py new file mode 100755 index 00000000..d1631bde --- /dev/null +++ b/test/samples/Colmax/colmax_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, 
int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 
0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int 
= COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
def generate_binary_float_case(op: str):
    """Stage two float32 inputs and write the golden for a two-operand op.

    Inputs are drawn from the shared seeded RNG; division-like ops use a
    divisor kept away from zero.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Keep the divisor away from zero for division/remainder.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Stage one float32 input and write the golden for a scalar/unary op.

    Fix: the original resolved `style` through three overlapping `if` chains
    (two assignments were dead) and then *regenerated* `src` for divs/rems and
    log/sqrt/rsqrt/recip, discarding the first RNG draw. The style is now
    resolved once and `src` drawn once; the input and golden files remain
    mutually consistent (for the affected ops the concrete random bytes differ
    from before, but they come from the same distribution and the golden is
    recomputed from the same src).

    NOTE(review): for op="divs" with scalar_left=True the golden divides by a
    "signed" src that may be near zero — confirm that is intended.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Resolve the value distribution exactly once per op.
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"          # keep the op's domain valid
    elif op == "exp":
        style = "exp"               # bounded exponent input
    elif op == "cmps":
        style = "cmp"
    elif op == "divs2":
        style = "nonzero_signed"    # src is the divisor: keep it away from zero
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Stage compare inputs; golden is a packed predicate mask.

    "cmp" compares two tensors with strict less-than; "cmps" compares one
    tensor against `scalar` with strict greater-than (note the asymmetry).
    The mask storage stride is derived from the output element count.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    packed = pack_predicate_mask(pred, storage_cols=storage_cols)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Stage a random packed mask plus two tensors; golden selects src0 where
    the mask bit is set, src1 elsewhere."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    # Per-row byte stride of the packed mask, derived from its element count.
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a produced binary against its golden within tolerance `eps`.

    Returns True on match. On mismatch, prints the worst absolute difference
    (values widened to int64/float64 for a safe subtraction) and returns False.
    """
    for label, path in (("Output", output_path), ("Golden", golden_path)):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff cannot overflow or lose precision.
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    golden_cmp = golden.astype(wide, copy=False)
    output_cmp = output.astype(wide, copy=False)
    abs_diff = np.abs(golden_cmp - output_cmp)
    idx = int(np.argmax(abs_diff))
    diff = float(abs_diff[idx])
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
        f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
    )
    return False
def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare two packed predicate-mask dumps over the meaningful row bytes only."""
    for label, path in (("Output", output_path), ("Golden", golden_path)):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # Only the packed word bytes at the start of each row carry mask bits.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden, then gate via finalize_compare."""
    meta = load_case_meta()
    # Materialize the list so every output is compared (and reported) even after a failure.
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared packed-mask output, then gate via finalize_compare."""
    meta = load_case_meta()
    results = [compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) for name in meta.outputs]
    return finalize_compare(all(results))
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_col_reduce_case("colmax", accumulate=False) diff --git a/test/samples/Colmin/colmin_compare.py b/test/samples/Colmin/colmin_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Colmin/colmin_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """A zero-filled staging buffer with `name`'s declared count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffers for every name the harness reads, keyed by name."""
    return {buf_name: _zero_buffer(meta, buf_name) for buf_name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize each staged buffer to ./<name>.bin, validating its size."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        flat = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize each expected result to ./golden_<name>.bin, validating its size."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        flat = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} golden elements, got {flat.size}")
        flat.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's sole output name; raise if it declares several."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Stage two float32 inputs and write the golden for a two-operand op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Keep the divisor away from zero for division/remainder.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    staged = _default_buffers(meta)
    staged[lhs_name] = lhs
    staged[rhs_name] = rhs
    _write_buffers(meta, staged)

    if op == "add":
        golden = lhs + rhs
    elif op == "sub":
        golden = lhs - rhs
    elif op == "mul":
        golden = lhs * rhs
    elif op == "div":
        golden = lhs / rhs
    elif op == "max":
        golden = np.maximum(lhs, rhs)
    elif op == "min":
        golden = np.minimum(lhs, rhs)
    elif op == "rem":
        golden = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Stage one float32 input and write the golden for a scalar/unary float op.

    NOTE(review): `style` is resolved through three overlapping `if` chains and
    `src` is then regenerated for divs/rems and log/sqrt/rsqrt/recip, so the
    first RNG draw is discarded for those ops — looks like leftover dead code;
    the input and golden stay mutually consistent either way.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # log/sqrt/rsqrt/recip need positive operands; exp gets a bounded exponent.
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    # divs2 uses src as the divisor, so it must stay away from zero.
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        # scalar_left selects scalar/src instead of src/scalar.
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_unary_float_case(op: str):
    """Stage one float32 input and write the golden for a unary elementwise op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # exp gets a bounded exponent; log/sqrt/rsqrt/recip need positive operands.
    if op in {"exp"}:
        value_style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        value_style = "positive"
    else:
        value_style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=value_style)
    staged = _default_buffers(meta)
    staged[src_name] = src
    _write_buffers(meta, staged)

    golden_fns = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda v: 1.0 / np.sqrt(v),
        "recip": lambda v: 1.0 / v,
        "relu": lambda v: np.maximum(v, np.float32(0.0)),
    }
    if op not in golden_fns:
        raise ValueError(f"unsupported unary float op: {op}")
    out = golden_fns[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Stage src and per-element slopes; golden is src where src > 0 else src * slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    staged = _default_buffers(meta)
    staged[src_name] = src
    staged[slope_name] = slope
    _write_buffers(meta, staged)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Stage a, b, c inputs; golden is a + b + c (addc) or a - b + c (subc)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    operand_a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    operand_b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    # c acts as the "carry" operand and is kept smaller than a and b.
    operand_c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    staged = _default_buffers(meta)
    staged[a_name] = operand_a
    staged[b_name] = operand_b
    staged[c_name] = operand_c
    _write_buffers(meta, staged)
    if op == "addc":
        golden = operand_a + operand_b + operand_c
    elif op == "subc":
        golden = operand_a - operand_b + operand_c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_scalar_carry_case(op: str, scalar: float):
    """Stage one float input; golden combines src with a scalar and src again.

    NOTE(review): src appears twice in the golden (src ± scalar + src) — looks
    intentional for the "*sc" carry ops, but confirm against the kernel
    semantics before relying on it.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Stage one ROWS x COLS input; golden reduces each row (axis=1) to one value."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        # Accumulate in float32 to mirror the on-device accumulator width.
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Stage a ROWS x COLS input; golden reduces each column (axis=0).

    colsum cases declare an extra scratch input (zero-filled here); with
    accumulate=True the output buffer is pre-seeded with small values and the
    golden adds that initial content to the column sums.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        # Scratch buffer is staged as zeros; the kernel overwrites it.
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Colmin/colmin_golden.py b/test/samples/Colmin/colmin_golden.py new file mode 100755 index 00000000..183ce333 --- /dev/null +++ b/test/samples/Colmin/colmin_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + 
+import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, 
rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D 
array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = 
np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + 
buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 
1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_col_reduce_case("colmin", accumulate=False) diff --git a/test/samples/Colsum/colsum_compare.py b/test/samples/Colsum/colsum_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null +++ b/test/samples/Colsum/colsum_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + 
elem_counts: Dict[str, int]      # buffer name -> element count parsed from main.cpp
    np_types: Dict[str, np.dtype]    # buffer name -> numpy dtype parsed from sizeof(...) in main.cpp
    read_order: List[str]            # buffer names in the order main.cpp ReadFile()s them
    outputs: List[str]               # buffer names listed in outputs.txt (kernel outputs)

    @property
    def inputs(self) -> List[str]:
        """Names read by the case that are not outputs, in ReadFile order."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a C/C++ host type name (as written in main.cpp) to a numpy dtype."""
    host_type = host_type.strip()
    if host_type not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {host_type}")
    return np.dtype(_HOST_TYPE_TO_NP[host_type])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Scrape buffer metadata for the current case out of its main.cpp.

    Parses ``elemCount_<name>`` / ``fileSize_<name>`` declarations and the
    ``ReadFile("./<name>.bin", ...)`` call order; output names come from
    ``outputs.txt`` (one per line) when that file exists.
    """
    text = Path(main_cpp).read_text(encoding="utf-8")
    elem_counts = {
        match.group(1): int(match.group(2))
        for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text)
    }
    # fileSize_<name> = elemCount_<name> * sizeof(<type>) fixes each buffer's dtype.
    np_types = {
        match.group(1): _host_type_to_np(match.group(2))
        for match in re.finditer(
            r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
            text,
        )
    }
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    if Path(outputs_txt).is_file():
        outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()]
    else:
        outputs = []
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)


def _rng():
    """Fresh deterministic generator; every case reseeds with the same SEED."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as a rows x cols matrix, validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw ``count`` float32 values from a named distribution ``style``."""
    if style == "signed":
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    elif style == "signed_small":
        arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    elif style == "nonzero_signed":
        # Like "signed" but values are pushed away from zero (|v| >= 0.25)
        # so divisions stay well-conditioned.
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        mask = np.abs(arr) < np.float32(0.25)
        arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25))
    elif style == "positive":
        arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    elif style == "exp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    elif style == "cmp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    else:
        raise ValueError(f"unsupported float style: {style}")
    return arr


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw ``count`` integer values for bitwise/shift cases.

    "bitwise" draws in [-256, 256); "shift_small" draws shift amounts in
    [0, 4) (int32 only) so left/right shifts stay within the type's width.
    """
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes of packed predicate bits per row: one little-endian u64 per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into the device's predicate-mask byte layout.

    Each row stores its bits LSB-first in 64-bit little-endian words at the
    start of a ``storage_cols``-byte stride; the remaining bytes are zero
    padding.  Returns the flattened uint8 buffer.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            width = min(64, cols - base_col)
            word = 0
            for bit_idx in range(width):
                if bits[row, base_col + bit_idx]:
                    word |= 1 << bit_idx
            # Serialize the 64-bit word little-endian into its 8-byte slot.
            out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return out.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of :func:`pack_predicate_mask`: recover the 2D boolean mask.

    The per-row storage stride is inferred from the buffer size; padding
    bytes beyond the packed words are ignored.
    """
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            width = min(64, cols - base_col)
            for bit_idx in range(width):
                bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zeros buffer with ``name``'s declared element count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled placeholder for every buffer the case reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write every read buffer to ``<name>.bin``, validating count and dtype."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write each expected output to ``golden_<name>.bin``, validating count/dtype."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def 
_single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, 
buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, 
got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    # Element-wise PReLU: positive values pass through, negatives scale by slope.
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs/golden for three-operand add/sub-with-carry ops (a op b + c)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate input/golden for scalar carry ops.

    NOTE(review): golden is src +/- scalar + src (the source appears twice);
    presumably this mirrors the addsc/subsc kernel semantics — confirm
    against the kernel implementation.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate input/golden for a per-row reduction (sum/max/min over axis 1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate input/golden for a per-column reduction (sum/max/min over axis 0).

    colsum additionally expects a scratch ("tmp") input buffer, and with
    ``accumulate`` the output buffer is pre-seeded and folded into the sum.
    ``accumulate`` only affects colsum; colmax/colmin ignore it.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    # The output buffer itself is written as an input: zeros normally, random
    # small values when the kernel accumulates into it.
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Generate input/golden for rowexpand: broadcast column 0 across all columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Generate input/golden for colexpand: broadcast row 0 across all rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Generate inputs/golden for rowexpand binary ops.

    The first ROWS elements of src1 act as one scalar per row, broadcast
    across src0's columns before applying mul/div/sub.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    src0_m = _as_matrix(src0)
    src1_m = _as_matrix(src1)
    row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = src0_m * row_scalars[:, None]
    elif op == "rowexpanddiv":
        out = src0_m / row_scalars[:, None]
    elif op == "rowexpandsub":
        out = src0_m - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Generate golden for expands: output is filled with ``scalar`` (inputs all zero)."""
    meta = load_case_meta()
    buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    # Select src0 where the mask bit is set, src1 elsewhere.
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Generate inputs/golden for sels: golden is src0 when select_mode == 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate input/golden for a bitwise/shift op applied to the source and itself.

    Shift ops draw "shift_small" values (0..3) so ``src`` is valid both as
    operand and as shift amount.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "and":
        out = np.bitwise_and(src, src)
    elif op == "or":
        out = np.bitwise_or(src, src)
    elif op == "xor":
        out = np.bitwise_xor(src, src)
    elif op == "shl":
        out = np.left_shift(src, src)
    elif op == "shr":
        out = np.right_shift(src, src)
    elif op == "not":
        out = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate input/golden for a bitwise/shift op against an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Narrow the scalar to the operand dtype first so the golden matches
    # the device's fixed-width arithmetic.
    scalar = np.asarray(scalar, dtype=dtype).item()
    if op == "ands":
        out = np.bitwise_and(src, scalar)
    elif op == "ors":
        out = np.bitwise_or(src, scalar)
    elif op == "xors":
        out = np.bitwise_xor(src, scalar)
    elif op == "shls":
        out = np.left_shift(src, scalar)
    elif op == "shrs":
        out = np.right_shift(src, scalar)
    else:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a raw output .bin against its golden with atol=rtol=eps.

    Returns True on match; on mismatch prints the worst-offending element
    and returns False.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        if golden.size:
            # Widen for diff reporting only (the pass/fail decision above
            # already happened at the original dtype).
            if np.issubdtype(dtype_np, np.floating):
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
                # NOTE(review): np.integer already subsumes np.unsignedinteger,
                # and casting very large uint64 values to int64 can wrap —
                # affects the printed diff only, not the comparison result.
                golden_cmp = golden.astype(np.int64, copy=False)
                output_cmp = output.astype(np.int64, copy=False)
            else:
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            abs_diff = np.abs(golden_cmp - output_cmp)
            idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Colsum/colsum_golden.py b/test/samples/Colsum/colsum_golden.py new file mode 100755 index 00000000..f50d62d7 --- /dev/null +++ b/test/samples/Colsum/colsum_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """Zero-filled buffer with the declared count/dtype of tensor `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """One zeroed buffer per tensor, keyed by name, in host read order."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every tensor in read order to `<name>.bin`, validating sizes."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize reference results to `golden_<name>.bin` for each output."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the sole declared output name, or fail loudly."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Write random inputs and the CPU golden for an elementwise binary op.

    The right operand avoids a dead zone around zero for division-like ops so
    the golden stays well conditioned.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    op_table = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in op_table:
        raise ValueError(f"unsupported binary float op: {op}")
    out = op_table[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def _scalar_case_style(op: str) -> str:
    """Input distribution for each scalar/unary op handled below."""
    if op == "exp":
        return "exp"
    if op == "cmps":
        return "cmp"
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        return "positive"
    if op == "divs2":
        return "nonzero_signed"
    # divs/rems and all plain arithmetic ops use general signed input.
    return "signed"


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write a random input and golden for op(src, scalar).

    `scalar_left=True` flips the operand order for `divs` (scalar / src).
    Fix: the previous version drew the input up to three times, discarding
    earlier draws (dead stores); the style is now resolved once and drawn once.
    NOTE(review): for `divs` with scalar_left the source may be near zero,
    which yields very large quotients — consider `nonzero_signed` if that
    matters for tolerance; kept "signed" to match the original distribution.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style=_scalar_case_style(op))
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    if op == "adds":
        out = src + s
    elif op == "subs":
        out = src - s
    elif op == "muls":
        out = src * s
    elif op == "divs":
        out = s / src if scalar_left else src / s
    elif op == "maxs":
        out = np.maximum(src, s)
    elif op == "mins":
        out = np.minimum(src, s)
    elif op == "rems":
        out = np.fmod(src, s)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_unary_float_case(op: str):
    """Random input plus CPU golden for an elementwise unary op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Domain-restricted ops get inputs where the math is well defined.
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    unary_table = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda a: 1.0 / np.sqrt(a),
        "recip": lambda a: 1.0 / a,
        "relu": lambda a: np.maximum(a, np.float32(0.0)),
    }
    if op not in unary_table:
        raise ValueError(f"unsupported unary float op: {op}")
    out = unary_table[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Golden for PReLU: negative inputs are scaled by a per-element slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    x = _float_values(rng, meta.elem_counts[src_name], style="signed")
    alpha = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = x
    buffers[slope_name] = alpha
    _write_buffers(meta, buffers)
    out = np.where(x > 0.0, x, x * alpha)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Golden for three-operand add/sub-with-carry style ops: a (+/-) b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Golden for scalar carry ops: dst = src (+/-) scalar + src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # NOTE(review): the trailing `+ src` models the kernel accumulating onto a
    # destination preloaded with its own input — confirm against the addsc/
    # subsc kernel contract.
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Per-row reduction golden: one float per row of the tile."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = tile.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = tile.max(axis=1)
    elif op == "rowmin":
        out = tile.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Column-wise reduction golden.

    With `accumulate=True` the output row is preloaded with random values that
    colsum adds onto, matching a read-modify-write kernel.  RNG draw order
    (src first, then out_init) is significant and preserved.
    """
    meta = load_case_meta()
    if op == "colsum":
        # colsum kernels also read a scratch tensor: two non-output inputs.
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = tile.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = tile.max(axis=0)
    elif op == "colmin":
        out = tile.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})
def generate_rowexpand_case():
    """Golden: each row is filled with that row's first element."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.repeat(tile[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden: every row is a copy of the first row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.repeat(tile[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden for row-broadcast binary ops: the first ROWS elements of src1
    act as one scalar per row applied across src0's rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # Keep divisors away from zero for the division flavour.
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    tile0 = _as_matrix(src0)
    tile1 = _as_matrix(src1)
    row_scalars = tile1.reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = tile0 * row_scalars[:, None]
    elif op == "rowexpanddiv":
        out = tile0 / row_scalars[:, None]
    elif op == "rowexpandsub":
        out = tile0 - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for a fill op: output is `scalar` everywhere; any declared
    inputs are written out as zeros."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Golden for predicate-producing compares; the output is a packed
    per-row bitmask (see pack_predicate_mask)."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        # NOTE(review): 'cmp' packs src0 < src1 while 'cmps' packs
        # src0 > scalar; confirm this asymmetry matches the kernels.
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    packed = pack_predicate_mask(pred, storage_cols=storage_cols)
    _write_golden(meta, {out_name: packed})
def generate_sel_case():
    """Golden for mask-select: out = src0 where the mask bit is set, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    # Fix: validate divisibility before deriving the packed-mask row stride,
    # mirroring the check generate_cmp_case performs on its output (the old
    # code silently floored on a non-divisible count).
    if meta.elem_counts[mask_name] % ROWS != 0:
        raise ValueError(f"{mask_name}: cannot derive mask storage stride from count={meta.elem_counts[mask_name]}")
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for scalar-mode select: mode 1 passes src0 through, any other
    mode passes src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Golden for integer ops applied to a tensor and itself (src op src).

    NOTE(review): and/or are identities and xor is all-zero under self-apply;
    presumably these cases exercise the kernel datapath rather than the math —
    confirm intent.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift ops draw tiny amounts (0..3) so `src << src` cannot overflow.
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    self_ops = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op in self_ops:
        out = self_ops[op](src, src)
    elif op == "not":
        out = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Golden for integer tensor-vs-scalar bit ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Wrap/cast the scalar into the tensor dtype before applying the op.
    scalar = np.asarray(scalar, dtype=dtype).item()
    scalar_ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    out = scalar_ops[op](src, scalar)
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare one output .bin against its golden with atol=rtol=eps.

    Prints a diagnostic (worst element for non-empty buffers) and returns
    False on any missing file, shape mismatch or value mismatch.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Promote to a wide common type so the reported diff cannot overflow
        # (np.integer already covers unsigned integer dtypes).
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate masks, ignoring per-row padding bytes.

    Buffers are viewed as `rows` rows of `cols` bytes; only the first
    packed-word bytes of each row carry mask bits.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    diff = np.nonzero(golden_sel != output_sel)[0]
    idx = int(diff[0]) if diff.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden, then gate."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Packed-mask variant of compare_all_outputs."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Report the overall verdict.

    Exits with status 2 on failure unless COMPARE_STRICT=0 downgrades the
    failure to a non-gating warning.
    """
    strict = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if strict:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True


if __name__ == "__main__":
    generate_col_reduce_case("colsum", accumulate=True)

# --- residue of the surrounding git patch: header of the next added file ---
# diff --git a/test/samples/Div/div_compare.py b/test/samples/Div/div_compare.py
# new file mode 100755
# index 00000000..081d562c
# --- /dev/null
# +++ b/test/samples/Div/div_compare.py
# @@ -0,0 +1,737 @@
#!/usr/bin/python3
# coding=utf-8
+import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, 
rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D 
def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: expand packed words back to booleans."""
    flat = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if flat.size % rows != 0:
        raise ValueError(f"mask buffer size {flat.size} is not divisible by rows={rows}")
    storage_cols = flat.size // rows
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = flat.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            for bit_idx in range(min(64, cols - base_col)):
                bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """Zero-filled buffer with the declared count/dtype of tensor `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """One zeroed buffer per tensor, keyed by name, in host read order."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every tensor in read order to `<name>.bin`, validating sizes."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize reference results to `golden_<name>.bin` for each output."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the sole declared output name, or fail loudly."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Write random inputs and the CPU golden for an elementwise binary op.

    The right operand avoids a band around zero for division-like ops so the
    golden stays well conditioned.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def _scalar_style_for(op: str) -> str:
    """Input distribution for each scalar/unary op handled below."""
    if op == "exp":
        return "exp"
    if op == "cmps":
        return "cmp"
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        return "positive"
    if op == "divs2":
        return "nonzero_signed"
    return "signed"


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write a random input and golden for op(src, scalar).

    `scalar_left=True` flips the operand order for `divs` (scalar / src).
    Fix: the previous version drew the input up to three times and discarded
    earlier draws (dead stores, wasted entropy); the style is now resolved
    once and the input drawn once.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style=_scalar_style_for(op))
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    if op == "adds":
        out = src + s
    elif op == "subs":
        out = src - s
    elif op == "muls":
        out = src * s
    elif op == "divs":
        # NOTE(review): with scalar_left the divisor is the (possibly tiny)
        # source; consider nonzero_signed input if tolerances get tight.
        out = s / src if scalar_left else src / s
    elif op == "maxs":
        out = np.maximum(src, s)
    elif op == "mins":
        out = np.minimum(src, s)
    elif op == "rems":
        out = np.fmod(src, s)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_unary_float_case(op: str):
    """Random input plus CPU golden for an elementwise unary op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Domain-restricted ops draw inputs where the math is well defined.
    style = "signed"
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    table = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda a: 1.0 / np.sqrt(a),
        "recip": lambda a: 1.0 / a,
        "relu": lambda a: np.maximum(a, np.float32(0.0)),
    }
    if op not in table:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): table[op](src).astype(np.float32)})


def generate_prelu_case():
    """Golden for PReLU: negatives scaled by a per-element slope tensor."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    x = _float_values(rng, meta.elem_counts[src_name], style="signed")
    alpha = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = x
    buffers[slope_name] = alpha
    _write_buffers(meta, buffers)
    result = np.where(x > 0.0, x, x * alpha)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_addc_case(op: str):
    """Golden for three-operand carry-style ops: a (+/-) b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Golden for scalar carry ops: dst = src (+/-) scalar + src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # NOTE(review): the trailing `+ src` models the kernel accumulating onto
    # a destination preloaded with its own input — confirm against the
    # addsc/subsc kernel contract.
    if op == "addsc":
        result = src + np.float32(scalar) + src
    elif op == "subsc":
        result = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Per-row reduction golden: one float per row of the tile."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    grid = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        reduced = grid.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        reduced = grid.max(axis=1)
    elif op == "rowmin":
        reduced = grid.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reduced.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Column-wise reduction golden.

    With `accumulate=True` the output row is preloaded with random values
    that colsum adds onto (read-modify-write kernels).  RNG draw order —
    src first, then the preload — is significant and preserved.
    """
    meta = load_case_meta()
    if op == "colsum":
        # colsum kernels also read a scratch tensor: two non-output inputs.
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    grid = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        reduced = grid.sum(axis=0, dtype=np.float32)
        if accumulate:
            reduced = reduced + out_init
    elif op == "colmax":
        reduced = grid.max(axis=0)
    elif op == "colmin":
        reduced = grid.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: reduced.astype(np.float32)})


def generate_rowexpand_case():
    """Golden: each row is filled with that row's first element."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    grid = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.repeat(grid[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden: every row is a copy of the first row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    grid = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.repeat(grid[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden for row-broadcast binary ops: the first ROWS elements of src1
    act as one scalar per row applied across src0's rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # Keep divisors away from zero for the division flavour.
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    grid0 = _as_matrix(src0)
    grid1 = _as_matrix(src1)
    row_scalars = grid1.reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = grid0 * row_scalars[:, None]
    elif op == "rowexpanddiv":
        out = grid0 / row_scalars[:, None]
    elif op == "rowexpandsub":
        out = grid0 - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Div/div_golden.py b/test/samples/Div/div_golden.py new file mode 100755 index 00000000..1f6f9f3b --- /dev/null +++ b/test/samples/Div/div_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: 
Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("div") diff --git a/test/samples/Divs/divs_compare.py b/test/samples/Divs/divs_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null +++ b/test/samples/Divs/divs_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Divs/divs_golden.py b/test/samples/Divs/divs_golden.py new file mode 100755 index 00000000..e6b841f1 --- /dev/null +++ b/test/samples/Divs/divs_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
+import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + 
rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_scalar_float_case("divs", 3.14) diff --git a/test/samples/Divs2/divs2_compare.py b/test/samples/Divs2/divs2_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null +++ b/test/samples/Divs2/divs2_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, 
int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 
0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int 
= COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Divs2/divs2_golden.py b/test/samples/Divs2/divs2_golden.py new file mode 100755 index 00000000..13242400 --- /dev/null +++ b/test/samples/Divs2/divs2_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare an output .bin against its golden .bin elementwise.

    Returns True when both files exist, hold the same element count, and all
    elements match within atol=rtol=eps (NaNs compare equal).  On mismatch the
    most divergent element is printed for diagnosis.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        if golden.size:
            # Promote for the diagnostic diff.  The previous code folded
            # unsigned integers into int64, which overflows for uint64 values
            # above 2**63-1 (and the extra np.unsignedinteger test was dead:
            # np.integer already covers it).  Unsigned dtypes now use an exact
            # wrap-free difference in their own dtype.
            if np.issubdtype(dtype_np, np.unsignedinteger):
                golden_cmp = golden
                output_cmp = output
                abs_diff = np.where(
                    golden_cmp >= output_cmp, golden_cmp - output_cmp, output_cmp - golden_cmp
                )
            elif np.issubdtype(dtype_np, np.integer):
                golden_cmp = golden.astype(np.int64, copy=False)
                output_cmp = output.astype(np.int64, copy=False)
                abs_diff = np.abs(golden_cmp - output_cmp)
            else:
                # Floating (and any other) dtypes diff in float64.
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
                abs_diff = np.abs(golden_cmp - output_cmp)
            idx = int(np.argmax(abs_diff))
            diff = float(abs_diff[idx])
            print(
                f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
                f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
            )
        else:
            print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    return True


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate-mask buffers, ignoring per-row padding bytes.

    Buffers are viewed as (rows, cols) uint8 where only the first
    _packed_row_bytes(cols) bytes of each row carry mask bits.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # Only the leading payload bytes of each storage row are significant.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel != output_sel)[0]
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True


def compare_all_outputs(dtype, eps):
    """Compare every declared output tensor; gate the run via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Report the aggregate result; exit(2) on failure unless COMPARE_STRICT=0."""
    strict = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if strict:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True


if __name__ == "__main__":
    generate_scalar_float_case("divs", 3.14, scalar_left=True)
coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def 
def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Reshape a flat buffer to (rows, cols); ValueError on size mismatch."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples whose range suits the named op family."""
    if style == "signed":
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    elif style == "signed_small":
        arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    elif style == "nonzero_signed":
        # Signed values pushed away from zero so they are safe divisors.
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        mask = np.abs(arr) < np.float32(0.25)
        arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25))
    elif style == "positive":
        # Strictly positive domain for log/sqrt/rsqrt/recip.
        arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    elif style == "exp":
        # Narrow range keeps exp() well inside float32.
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    elif style == "cmp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    else:
        raise ValueError(f"unsupported float style: {style}")
    return arr


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw integer samples for bitwise/shift ops in the requested dtype."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            # Shift amounts stay in [0, 4) so shifts remain well-defined.
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes of mask payload per row: cols bits rounded up to 64-bit words."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a (rows, cols) bool matrix into little-endian 64-bit mask words.

    Each row occupies `storage_cols` bytes; bit i of word w covers column
    w*64 + i.  Padding bytes beyond the payload stay zero.  Returns the flat
    uint8 buffer.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            # Fold up to 64 columns into one little-endian word.
            width = min(64, cols - base_col)
            word = 0
            for bit_idx in range(width):
                if bits[row, base_col + bit_idx]:
                    word |= 1 << bit_idx
            out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return out.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: flat byte buffer -> (rows, cols) bools.

    The per-row storage stride is inferred from the buffer size.
    """
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            width = min(64, cols - base_col)
            for bit_idx in range(width):
                bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zero buffer with `name`'s declared element count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every tensor the case reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}
{name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = 
def generate_unary_float_case(op: str):
    """Write input + golden for a one-operand elementwise float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Pick an input domain the op is defined on.
    style = "signed"
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Write inputs + golden for PRelu: out = src if src > 0 else src*slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Write inputs + golden for the 3-operand addc/subc ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        # NOTE(review): the c term is *added* in both branches — confirm this
        # matches the kernel's subc definition (a - b + c).
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Write input + golden for scalar add/sub with the source re-accumulated."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # NOTE(review): golden is (src op scalar) + src — presumably the kernel
    # accumulates into a destination preloaded with src; verify against it.
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Write input + golden for a per-row reduction (ROWS results)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Write input + golden for a per-column reduction.

    colsum cases additionally declare a scratch tensor (kept zero-filled);
    with accumulate=True the output buffer is preloaded and summed into.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        # Non-zero initial destination exercises the accumulate path.
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Write input + golden for broadcasting column 0 across each row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Write input + golden for broadcasting row 0 across each column."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_rowexpand_bin_case(op: str):
    """Write inputs + golden for ops combining a matrix with per-row scalars.

    The first ROWS elements of the second input (flattened) are taken as one
    scalar per row and broadcast across that row.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # Divisor rows must stay away from zero for rowexpanddiv.
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    src0_m = _as_matrix(src0)
    src1_m = _as_matrix(src1)
    row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = src0_m * row_scalars[:, None]
    elif op == "rowexpanddiv":
        out = src0_m / row_scalars[:, None]
    elif op == "rowexpandsub":
        out = src0_m - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Write (zero) inputs + golden for filling the output with `scalar`."""
    meta = load_case_meta()
    buffers = _default_buffers(meta)
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: out})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Write inputs + packed-predicate golden for cmp (src0 < src1) or
    cmps (src0 > scalar)."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    # The mask output's element count implies its per-row storage stride.
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    packed = pack_predicate_mask(pred, storage_cols=storage_cols)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Write mask + two sources + golden for mask-driven element select."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    # Golden select uses the unpacked bits: True picks src0, False picks src1.
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Write two sources + golden for whole-tensor select (mode 1 -> src0)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare an output .bin against its golden .bin elementwise.

    Returns True when both files exist, hold the same element count, and all
    elements match within atol=rtol=eps (NaNs compare equal).  On mismatch the
    most divergent element is printed for diagnosis.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        if golden.size:
            # Promote for the diagnostic diff.  The previous code folded
            # unsigned integers into int64, which overflows for uint64 values
            # above 2**63-1 (and the extra np.unsignedinteger test was dead:
            # np.integer already covers it).  Unsigned dtypes now use an exact
            # wrap-free difference in their own dtype.
            if np.issubdtype(dtype_np, np.unsignedinteger):
                golden_cmp = golden
                output_cmp = output
                abs_diff = np.where(
                    golden_cmp >= output_cmp, golden_cmp - output_cmp, output_cmp - golden_cmp
                )
            elif np.issubdtype(dtype_np, np.integer):
                golden_cmp = golden.astype(np.int64, copy=False)
                output_cmp = output.astype(np.int64, copy=False)
                abs_diff = np.abs(golden_cmp - output_cmp)
            else:
                # Floating (and any other) dtypes diff in float64.
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
                abs_diff = np.abs(golden_cmp - output_cmp)
            idx = int(np.argmax(abs_diff))
            diff = float(abs_diff[idx])
            print(
                f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
                f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
            )
        else:
            print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    return True


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate-mask buffers, ignoring per-row padding bytes.

    Buffers are viewed as (rows, cols) uint8 where only the first
    _packed_row_bytes(cols) bytes of each row carry mask bits.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # Only the leading payload bytes of each storage row are significant.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel != output_sel)[0]
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True


def compare_all_outputs(dtype, eps):
    """Compare every declared output tensor; gate the run via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Report the aggregate result; exit(2) on failure unless COMPARE_STRICT=0."""
    strict = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if strict:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True


if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-3)
#!/usr/bin/python3
# coding=utf-8

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


# Fixed RNG seed: inputs and goldens must be reproducible across runs/hosts.
SEED = 19
# Logical matrix geometry used by the 32x32 sample kernels.
ROWS = 32
COLS = 32

# Host type token (as written inside sizeof(...) in main.cpp) -> numpy dtype.
# NOTE(review): bfloat16_t maps to uint16 — numpy has no native bfloat16, so
# those buffers are handled as raw bits; confirm comparisons account for this.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}
@dataclass
class CaseMeta:
    """Tensor metadata recovered from a case's main.cpp and outputs.txt."""

    # element count per tensor name (elemCount_<name> in main.cpp)
    elem_counts: Dict[str, int]
    # numpy dtype per tensor name (sizeof(<T>) in main.cpp)
    np_types: Dict[str, np.dtype]
    # tensor names in the order main.cpp ReadFile()s their .bin files
    read_order: List[str]
    # tensor names listed in outputs.txt (the kernel's outputs)
    outputs: List[str]

    @property
    def inputs(self) -> List[str]:
        """Read-order tensor names that are not declared outputs."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a main.cpp host type token to a numpy dtype; KeyError if unknown."""
    host_type = host_type.strip()
    if host_type not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {host_type}")
    return np.dtype(_HOST_TYPE_TO_NP[host_type])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Build CaseMeta by regex-scanning the case's main.cpp (+ outputs.txt)."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    # `size_t elemCount_<name> = <N>;` declarations give per-tensor counts.
    elem_counts = {
        match.group(1): int(match.group(2))
        for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text)
    }
    # `size_t fileSize_<name> = elemCount_<name> * sizeof(<T>);` gives dtypes.
    np_types = {
        match.group(1): _host_type_to_np(match.group(2))
        for match in re.finditer(
            r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
            text,
        )
    }
    # The ReadFile("./<name>.bin", ...) order is the buffer consumption order.
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    if Path(outputs_txt).is_file():
        outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()]
    else:
        # No outputs.txt: outputs unknown, treat every buffer as an input.
        outputs = []
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)


def _rng():
    """Fresh deterministic generator; every case draws from the same SEED."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Reshape a flat buffer to (rows, cols); ValueError on size mismatch."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples whose range suits the named op family."""
    if style == "signed":
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    elif style == "signed_small":
        arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    elif style == "nonzero_signed":
        # Signed values pushed away from zero so they are safe divisors.
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        mask = np.abs(arr) < np.float32(0.25)
        arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25))
    elif style == "positive":
        # Strictly positive domain for log/sqrt/rsqrt/recip.
        arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    elif style == "exp":
        # Narrow range keeps exp() well inside float32.
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    elif style == "cmp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    else:
        raise ValueError(f"unsupported float style: {style}")
    return arr


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw integer samples for bitwise/shift ops in the requested dtype."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            # Shift amounts stay in [0, 4) so shifts remain well-defined.
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes of mask payload per row: cols bits rounded up to 64-bit words."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a (rows, cols) bool matrix into little-endian 64-bit mask words.

    Each row occupies `storage_cols` bytes; bit i of word w covers column
    w*64 + i.  Padding bytes beyond the payload stay zero.  Returns the flat
    uint8 buffer.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            # Fold up to 64 columns into one little-endian word.
            width = min(64, cols - base_col)
            word = 0
            for bit_idx in range(width):
                if bits[row, base_col + bit_idx]:
                    word |= 1 << bit_idx
            out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return out.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: flat byte buffer -> (rows, cols) bools.

    The per-row storage stride is inferred from the buffer size.
    """
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            width = min(64, cols - base_col)
            for bit_idx in range(width):
                bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zero buffer with `name`'s declared element count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every tensor the case reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write every read-order tensor to <name>.bin after size/dtype checks."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write a golden_<name>.bin reference for every declared output."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Name of the case's only output; ValueError unless there is exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Write inputs + golden for an elementwise float op with two operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisor-like operands avoid values near zero.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        # fmod: C-style remainder whose sign follows the dividend.
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write input + golden for a scalar/unary float op.

    scalar_left selects `scalar <op> src` instead of `src <op> scalar` for
    non-commutative ops (used by divs).
    NOTE(review): the branches below re-draw `src` for some ops; every draw
    advances the shared RNG stream, so the exact call sequence is baked into
    existing goldens — do not "simplify" it without regenerating goldens.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Choose an input domain the op is defined on.
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        # Leaky ReLU: negative inputs scaled by `scalar`.
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Write input + golden for a one-operand elementwise float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Pick an input domain the op is defined on.
    style = "signed"
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_unary_float_case("exp") diff --git a/test/samples/Expands/expand_compare.py b/test/samples/Expands/expand_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Expands/expand_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
def generate_rowexpand_bin_case(op: str):
    """Generate inputs/golden for ops between a matrix and one scalar per row.

    The first ROWS elements of the second input supply the per-row scalars;
    rowexpanddiv draws them nonzero-safe.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, scal_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    scal_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    scal = _float_values(rng, meta.elem_counts[scal_name], style=scal_style)
    lhs_tile = _as_matrix(lhs)
    per_row = _as_matrix(scal).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[scal_name] = scal
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = lhs_tile * per_row
    elif op == "rowexpanddiv":
        out = lhs_tile / per_row
    elif op == "rowexpandsub":
        out = lhs_tile - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for 'expands': output filled with the scalar, inputs all zero."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})
def generate_sel_case():
    """Generate a packed predicate mask plus two sources; golden = per-element select.

    Inputs come in the order (mask, a, b); true mask bits pick from `a`.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    choose_a = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(choose_a, storage_cols=storage_cols)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    selected = np.where(choose_a, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): selected.astype(np.float32).reshape(-1)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate input/golden for integer bitwise/shift ops with src as both operands.

    Shift ops draw tiny values (0..3) so src-shifted-by-src cannot overflow.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    value_style = "shift_small" if op in ("shl", "shr") else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    binary_ops = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(src)
    elif op in binary_ops:
        result = binary_ops[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two raw .bin files element-wise.

    Returns True when both files exist, hold the same number of `dtype`
    elements, and every element agrees within atol=rtol=eps (NaNs compare
    equal). On mismatch the worst-offending index is reported.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Promote before subtracting so the reported diff cannot wrap.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        g = golden.astype(wide, copy=False)
        o = output.astype(wide, copy=False)
        delta = np.abs(g - o)
        worst = int(np.argmax(delta))
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(delta[worst])} at idx={worst} "
            f"(golden={g[worst]}, out={o[worst]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
ROWS = 32  # re-stated module geometry (identical to the file-header constants);
COLS = 32  # needed for the default arguments below


def compare_all_outputs(dtype, eps):
    """Compare every declared output buffer against its golden file."""
    meta = load_case_meta()
    results = [
        compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output buffer as a packed predicate mask."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Report the overall verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-4)
+ +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: 
np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits 
ROWS = 32  # re-stated module constants (identical to the header values);
COLS = 32  # keeps the default arguments below self-contained


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: packed little-endian u64 words -> bool matrix.

    `buf` holds `rows` rows of `buf.size // rows` bytes each; the first
    ((cols + 63) // 64) * 8 bytes of each row carry the mask (bit i of the
    little-endian word block = column i), trailing bytes are padding.

    FIX(review): replaced the per-row/per-word/per-bit Python loops with one
    vectorized np.unpackbits call — bitorder="little" matches the
    little-endian word layout exactly, so byte order and in-word bit order
    both map straight onto column order. Output values are identical.

    Raises ValueError for non-positive rows/cols, a buffer whose size is not
    divisible by `rows`, or a row stride too small for `cols`.
    """
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = ((cols + 63) // 64) * 8  # == _packed_row_bytes(cols), inlined
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)[:, :row_bytes]
    bits = np.unpackbits(packed, axis=1, bitorder="little")
    return bits[:, :cols].astype(np.bool_)


def _zero_buffer(meta: "CaseMeta", name: str) -> np.ndarray:
    """All-zero array shaped and typed like buffer `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: "CaseMeta") -> Dict[str, np.ndarray]:
    """Zero-filled placeholder for every buffer main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}
def _write_golden(meta: "CaseMeta", outputs: Dict[str, np.ndarray]):
    """Write golden_<name>.bin for every declared output, validating dtype and size.

    Raises KeyError for a missing golden array and ValueError on a size mismatch.
    """
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: "CaseMeta") -> str:
    """Name of the case's sole output buffer (ValueError if there are 0 or 2+)."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate two float inputs and the golden for an element-wise binary op.

    div/rem draw a nonzero-safe right-hand side.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input/golden for scalar-operand and unary float ops.

    FIX(review): the original picked a style, drew the source, then
    unconditionally re-drew it for divs/rems and log/sqrt/rsqrt/recip with the
    same final style — dead draws whose only effect was shifting the random
    stream — and set a "cmps" style even though cmps is never dispatched
    below. The style is now chosen once and the source drawn once; generated
    inputs remain self-consistent with the written golden.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"        # these need strictly positive inputs
    elif op == "exp":
        style = "exp"             # bounded range keeps exp() well-conditioned
    elif op == "divs2":
        # Kept from the original; note divs2 is not dispatched below and will
        # raise — TODO confirm which op it was meant to map to.
        style = "nonzero_signed"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        # scalar_left selects scalar/src instead of src/scalar.
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_prelu_case():
    """Generate inputs/golden for PReLU: negatives scaled by a per-element slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs/golden for three-operand add/sub-with-carry: a (+|-) b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    names = meta.inputs
    # Draw order matters for reproducibility: a, then b, then the small carry c.
    styles = ("signed", "signed", "signed_small")
    operands = [_float_values(rng, meta.elem_counts[n], style=s) for n, s in zip(names, styles)]
    buffers = _default_buffers(meta)
    buffers.update(zip(names, operands))
    _write_buffers(meta, buffers)
    a, b, c = operands
    if op == "addc":
        golden = a + b + c
    elif op == "subc":
        golden = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Generate input/golden for a per-row reduction (axis=1 sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    source_name = meta.inputs[0]
    values = _float_values(_rng(), meta.elem_counts[source_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[source_name] = values
    _write_buffers(meta, buffers)
    tile = _as_matrix(values)
    if op == "rowsum":
        reduced = tile.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        reduced = tile.max(axis=1)
    elif op == "rowmin":
        reduced = tile.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reduced.astype(np.float32)})
def generate_rowexpand_case():
    """Golden = column 0 of the source broadcast across all columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    source_name = meta.inputs[0]
    values = _float_values(_rng(), meta.elem_counts[source_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[source_name] = values
    _write_buffers(meta, buffers)
    expanded = np.broadcast_to(_as_matrix(values)[:, :1], (ROWS, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden = row 0 of the source broadcast down all rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    source_name = meta.inputs[0]
    values = _float_values(_rng(), meta.elem_counts[source_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[source_name] = values
    _write_buffers(meta, buffers)
    expanded = np.broadcast_to(_as_matrix(values)[:1, :], (ROWS, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})
def generate_expands_case(scalar: float):
    """Golden for 'expands': the output buffer is the scalar repeated everywhere.

    All declared input buffers are written as zero-filled placeholders.
    """
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})
def generate_sel_case():
    """Generate a packed predicate mask plus two sources; golden = per-element select.

    Inputs come in the order (mask, a, b); true mask bits select from `a`.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    choose_a = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(choose_a, storage_cols=storage_cols)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    selected = np.where(choose_a, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): selected.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for sels: whole-buffer select — mode 1 picks the first source."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    first_name, second_name = meta.inputs
    first = _float_values(rng, meta.elem_counts[first_name], style="signed")
    second = _float_values(rng, meta.elem_counts[second_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[first_name] = first
    buffers[second_name] = second
    _write_buffers(meta, buffers)
    chosen = first if int(select_mode) == 1 else second
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate input/golden for integer ops between a tensor and an immediate.

    Shift ops draw tiny tensor values (0..3); the immediate is first converted
    into the tensor dtype so mixed-width promotion cannot widen the result.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    value_style = "shift_small" if op in ("shls", "shrs") else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    imm = np.asarray(scalar, dtype=dtype).item()
    table = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = table[op](src, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_expands_case(3.14) diff --git a/test/samples/Expands/expands_compare.py b/test/samples/Expands/expands_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Expands/expands_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, 
int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 
0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int 
= COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Expands/expands_golden.py b/test/samples/Expands/expands_golden.py new file mode 100755 index 00000000..b5d7626c --- /dev/null +++ b/test/samples/Expands/expands_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
#!/usr/bin/python3
# coding=utf-8
"""Standalone golden-data generator / comparator for a single NPU sample case.

The script scrapes the case's ``main.cpp`` for buffer names, element counts and
host types, writes deterministic random ``<name>.bin`` inputs plus matching
``golden_<name>.bin`` reference outputs, and provides compare helpers that check
device-produced ``<name>.bin`` outputs against the goldens.
"""

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


SEED = 19
ROWS = 32
COLS = 32

# Host-side C++ type names (as spelled in main.cpp) -> numpy dtypes.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,  # raw bit pattern; numpy has no native bfloat16
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Buffer metadata scraped from a case's main.cpp / outputs.txt."""

    elem_counts: Dict[str, int]   # buffer name -> element count
    np_types: Dict[str, np.dtype]  # buffer name -> numpy dtype
    read_order: List[str]         # buffers in ReadFile order (inputs + outputs)
    outputs: List[str]            # names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Buffers the kernel reads that are not outputs."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host C++ type name to a numpy dtype; raise KeyError if unknown."""
    host_type = host_type.strip()
    if host_type not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {host_type}")
    return np.dtype(_HOST_TYPE_TO_NP[host_type])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Parse buffer metadata out of the case's main.cpp and outputs.txt."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    # `size_t elemCount_<name> = <n>;` declarations give element counts.
    elem_counts = {
        match.group(1): int(match.group(2))
        for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text)
    }
    # `size_t fileSize_<name> = elemCount_<name> * sizeof(<type>);` gives dtypes.
    np_types = {
        match.group(1): _host_type_to_np(match.group(2))
        for match in re.finditer(
            r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
            text,
        )
    }
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    if Path(outputs_txt).is_file():
        outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()]
    else:
        outputs = []
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)


def _rng():
    """Fresh deterministic generator so inputs/goldens are reproducible."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Reshape a flat buffer to (rows, cols), validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw float32 test values in a range suited to the op family."""
    if style == "signed":
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    elif style == "signed_small":
        arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    elif style == "nonzero_signed":
        # Clamp near-zero values away from 0 so divisions stay well-conditioned.
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        mask = np.abs(arr) < np.float32(0.25)
        arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25))
    elif style == "positive":
        arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    elif style == "exp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    elif style == "cmp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    else:
        raise ValueError(f"unsupported float style: {style}")
    return arr


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw integer test values; `shift_small` keeps shift amounts in [0, 4)."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes per mask row: one little-endian 64-bit word per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a (rows, cols) boolean mask into per-row 64-bit words, little-endian.

    Each row occupies `storage_cols` bytes; bit i of a word is column
    `base_col + i`.  Returns a flat uint8 buffer.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            width = min(64, cols - base_col)
            word = 0
            for bit_idx in range(width):
                if bits[row, base_col + bit_idx]:
                    word |= 1 << bit_idx
            out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return out.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: flat uint8 buffer -> (rows, cols) bools."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            width = min(64, cols - base_col)
            for bit_idx in range(width):
                bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """Zero-filled buffer of the declared count/dtype for `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero buffers for every file the kernel reads, in read order."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write every read-order buffer to `<name>.bin`, validating sizes."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write each expected output to `golden_<name>.bin`, validating sizes."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's single output name; raise if there isn't exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs/golden for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Keep the divisor away from zero for division-like ops.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input/golden for a scalar or unary float op.

    ``scalar_left`` only affects ``divs``: scalar / src instead of src / scalar.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # One style per op family; the original drew src twice for several ops and
    # discarded the first draw — a single draw keeps inputs/goldens consistent.
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    elif op == "exp":
        style = "exp"
    elif op == "cmps":
        style = "cmp"
    elif op == "divs2":
        style = "nonzero_signed"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    # NOTE(review): op "divs2" selects a value style above but has no branch
    # below (it raises) — confirm callers pass "divs" with scalar_left instead.
    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate input/golden for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "signed"
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the domain valid for log/sqrt/reciprocal
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs/golden for prelu: src where positive, src*slope otherwise."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs/golden for three-operand carry ops: a +/- b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate input/golden for scalar-carry ops: (src +/- scalar) + src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate input/golden for per-row (axis=1) reductions."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate input/golden for per-column (axis=0) reductions.

    colsum cases declare an extra scratch buffer; with ``accumulate`` the
    output buffer is pre-seeded and added into the column sums.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Generate input/golden: broadcast each row's first element across the row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Generate input/golden: broadcast the first row across all rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Generate inputs/golden for src0 (op) per-row scalars taken from src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    src0_m = _as_matrix(src0)
    src1_m = _as_matrix(src1)
    # The first ROWS elements of src1 act as one scalar per row.
    row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = src0_m * row_scalars[:, None]
    elif op == "rowexpanddiv":
        out = src0_m / row_scalars[:, None]
    elif op == "rowexpandsub":
        out = src0_m - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Generate a golden filled with one scalar (no meaningful inputs)."""
    meta = load_case_meta()
    buffers = _default_buffers(meta)
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: out})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed-predicate golden for cmp (src0<src1) / cmps (src0>scalar)."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    packed = pack_predicate_mask(pred, storage_cols=storage_cols)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Generate mask+srcs and golden for sel: where(mask, src0, src1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Generate inputs/golden for sels: whole-tensor select by mode (1 -> src0)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate input/golden for bitwise ops applied with src as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "and":
        out = np.bitwise_and(src, src)
    elif op == "or":
        out = np.bitwise_or(src, src)
    elif op == "xor":
        out = np.bitwise_xor(src, src)
    elif op == "shl":
        out = np.left_shift(src, src)
    elif op == "shr":
        out = np.right_shift(src, src)
    elif op == "not":
        out = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate input/golden for bitwise ops with a scalar second operand."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    scalar = np.asarray(scalar, dtype=dtype).item()  # wrap scalar into target dtype range
    if op == "ands":
        out = np.bitwise_and(src, scalar)
    elif op == "ors":
        out = np.bitwise_or(src, scalar)
    elif op == "xors":
        out = np.bitwise_xor(src, scalar)
    elif op == "shls":
        out = np.left_shift(src, scalar)
    elif op == "shrs":
        out = np.right_shift(src, scalar)
    else:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare one golden/output pair elementwise with atol=rtol=eps.

    Returns True on match; prints a diagnostic and returns False otherwise.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        if golden.size:
            # Widen before subtracting so the diff cannot overflow/lose precision.
            if np.issubdtype(dtype_np, np.floating):
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
                golden_cmp = golden.astype(np.int64, copy=False)
                output_cmp = output.astype(np.int64, copy=False)
            else:
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            abs_diff = np.abs(golden_cmp - output_cmp)
            idx = int(np.argmax(abs_diff))
            diff = float(abs_diff[idx])
            print(
                f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
                f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
            )
        else:
            print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    return True


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate masks, ignoring padding bytes past each row's words."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # Only the leading packed bytes of each row carry mask bits.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel != output_sel)[0]
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden; gate via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared packed-mask output; gate via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Report the compare result; exit(2) on failure unless COMPARE_STRICT=0."""
    strict = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if strict:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True


if __name__ == "__main__":
    generate_expands_case(3.14)
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Log/log_golden.py b/test/samples/Log/log_golden.py new file mode 100755 index 00000000..ae9746a6 --- /dev/null +++ b/test/samples/Log/log_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: 
Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_unary_float_case("log") diff --git a/test/samples/Lrelu/lrelu_compare.py b/test/samples/Lrelu/lrelu_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Lrelu/lrelu_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into per-row little-endian 64-bit words.

    Each run of 64 columns becomes one uint64 word (bit i = column base+i),
    stored as 8 little-endian bytes; each row occupies ``storage_cols`` bytes
    with any excess left zero.  Returns the flattened uint8 buffer.
    Raises ValueError for non-2D input or insufficient storage width.
    """
    mask = np.asarray(bits, dtype=np.bool_)
    if mask.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = mask.shape
    # Bytes needed per row: one 8-byte word per started group of 64 columns
    # (inlined from _packed_row_bytes so this block is self-contained).
    needed = ((cols + 63) // 64) * 8
    if storage_cols < needed:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = np.zeros((rows, storage_cols), dtype=np.uint8)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = 0
            for b in range(min(64, cols - base)):
                if mask[r, base + b]:
                    word |= 1 << b
            packed[r, w * 8:(w + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return packed.reshape(-1)
def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every tensor to ``<name>.bin`` in main.cpp read order.

    Each buffer is cast to its declared numpy dtype, flattened, checked
    against the declared element count, and written as raw bytes.
    Raises KeyError for a missing buffer and ValueError on a size mismatch.
    """
    for tensor in meta.read_order:
        if tensor not in buffers:
            raise KeyError(f"missing buffer for {tensor}")
        want = meta.elem_counts[tensor]
        flat = np.asarray(buffers[tensor], dtype=meta.np_types[tensor]).reshape(-1)
        if flat.size != want:
            raise ValueError(f"{tensor}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{tensor}.bin")
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate the input buffer and golden output for a scalar/unary float op.

    Reads case metadata from main.cpp, draws the single input tensor with an
    op-appropriate value style, writes all input .bin files, then writes
    golden_<out>.bin computed in float32.

    The original drew the source up to three times (the style-conditional
    draw, then unconditional redraws for divs/rems and log/sqrt/rsqrt/recip),
    discarding earlier results and burning RNG state.  The style is now
    resolved once and the tensor drawn once, which is deterministic under the
    fixed SEED and keeps input and golden consistent within a run.

    Raises ValueError when the case does not have exactly one input or when
    ``op`` has no golden implementation.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Resolve the draw style with explicit precedence.
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"       # keep the domain of log/sqrt/reciprocal valid
    elif op == "exp":
        style = "exp"
    elif op == "cmps":
        style = "cmp"
    elif op == "divs2":
        style = "nonzero_signed"
    else:
        style = "signed"         # adds/subs/muls/divs/maxs/mins/rems/lrelu/...
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        # scalar_left selects scalar/src instead of src/scalar.
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        # Leaky ReLU: negative inputs are scaled by the slope `scalar`.
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Generate input and golden for a per-row reduction (sum/max/min).

    The single input tensor is drawn as signed float32, written to disk, and
    reduced along axis 1 of its ROWS x COLS view to produce the golden.
    Raises ValueError on an unexpected input count or unknown ``op``.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    data = _float_values(rng, meta.elem_counts[name], style="signed")
    mat = _as_matrix(data)
    bufs = _default_buffers(meta)
    bufs[name] = data
    _write_buffers(meta, bufs)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    golden = reducers[op](mat)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_rowexpand_case():
    """Generate input and golden for row expansion.

    The golden replicates the first column of the ROWS x COLS input view
    across all COLS columns.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    data = _float_values(rng, meta.elem_counts[name], style="signed")
    mat = _as_matrix(data)
    bufs = _default_buffers(meta)
    bufs[name] = data
    _write_buffers(meta, bufs)
    # Broadcast column 0 across the full width.
    expanded = np.tile(mat[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})
def generate_rowexpand_bin_case(op: str):
    """Generate inputs and golden for ops combining a matrix with per-row scalars.

    The first ROWS elements of the second input act as one scalar per row,
    broadcast across the columns of the first input's ROWS x COLS view.
    For rowexpanddiv the scalars are drawn away from zero.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    lhs_m = _as_matrix(lhs)
    # One scalar per row, shaped (ROWS, 1) for broadcasting.
    per_row = _as_matrix(rhs).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    bufs = _default_buffers(meta)
    bufs[lhs_name] = lhs
    bufs[rhs_name] = rhs
    _write_buffers(meta, bufs)
    combine = {
        "rowexpandmul": lambda m, s: m * s,
        "rowexpanddiv": lambda m, s: m / s,
        "rowexpandsub": lambda m, s: m - s,
    }
    if op not in combine:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    result = combine[op](lhs_m, per_row)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})
def generate_sel_case():
    """Generate a packed predicate mask, two sources, and the select golden.

    The mask bits are drawn first (so the RNG stream matches the original
    generator), packed into the mask buffer's storage stride, then the two
    float sources are drawn.  Golden = where(mask, src0, src1).
    """
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    stride = meta.elem_counts[mask_name] // ROWS
    bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed = pack_predicate_mask(bits, storage_cols=stride)
    a_vals = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b_vals = _float_values(rng, meta.elem_counts[b_name], style="signed")
    bufs = _default_buffers(meta)
    bufs[mask_name] = packed
    bufs[a_name] = a_vals
    bufs[b_name] = b_vals
    _write_buffers(meta, bufs)
    chosen = np.where(bits, _as_matrix(a_vals), _as_matrix(b_vals))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})
def generate_bitwise_self_case(op: str):
    """Generate input and golden for a bitwise op applied to a tensor and itself.

    Shift ops draw small non-negative shift amounts; the rest draw general
    bitwise values.  Dispatch uses lazy lambdas so only the requested op is
    evaluated.  Raises ValueError on unexpected input count or unknown op.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    vals = _int_values(rng, meta.elem_counts[name], dtype, style=value_style)
    bufs = _default_buffers(meta)
    bufs[name] = vals
    _write_buffers(meta, bufs)
    table = {
        "and": lambda v: np.bitwise_and(v, v),
        "or": lambda v: np.bitwise_or(v, v),
        "xor": lambda v: np.bitwise_xor(v, v),
        "shl": lambda v: np.left_shift(v, v),
        "shr": lambda v: np.right_shift(v, v),
        "not": lambda v: np.bitwise_not(v),
    }
    if op not in table:
        raise ValueError(f"unsupported bitwise op: {op}")
    result = table[op](vals)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a golden dump against a device output dump element-wise.

    Both files are read as raw ``dtype`` arrays and compared with
    ``np.allclose(atol=eps, rtol=eps, equal_nan=True)``.  On any failure
    (missing file, shape mismatch, value mismatch) a diagnostic is printed
    and False is returned; True otherwise.

    Fix vs. original: the dtype-widening chain had an ``else`` branch
    identical to the floating branch, and tested ``np.unsignedinteger``
    redundantly (``np.integer`` already covers unsigned types); collapsed
    to a single check with identical behavior.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before subtracting so the difference cannot overflow/underflow.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden dump, then finalize.

    Every output is compared even after a failure (the list is fully built
    before ``all``), matching the original's non-short-circuit behavior so
    all diagnostics are printed.  An empty output list counts as passing.
    """
    meta = load_case_meta()
    results = [
        compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))
@dataclass
class CaseMeta:
    """Per-case tensor metadata scraped from a generated main.cpp."""

    # Tensor name -> declared element count (from elemCount_<name>).
    elem_counts: Dict[str, int]
    # Tensor name -> numpy dtype (from the sizeof(<host type>) expression).
    np_types: Dict[str, np.dtype]
    # Tensor names in the order main.cpp calls ReadFile on them.
    read_order: List[str]
    # Tensor names listed in outputs.txt (may be empty when the file is absent).
    outputs: List[str]

    @property
    def inputs(self) -> List[str]:
        """Non-output tensor names, preserving main.cpp read order."""
        produced = set(self.outputs)
        return [name for name in self.read_order if name not in produced]
int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D 
array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = 
np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + 
buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 
1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_scalar_float_case("lrelu", 3.14) diff --git a/test/samples/Max/max_compare.py b/test/samples/Max/max_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Max/max_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + 
np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
def generate_addc_case(op: str):
    """Golden for the three-operand carry ops: a (+|-) b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    names = meta.inputs
    # Same draw order and styles as before: a, b signed; carry c small.
    vals = [
        _float_values(rng, meta.elem_counts[name], style=style)
        for name, style in zip(names, ("signed", "signed", "signed_small"))
    ]
    buffers = _default_buffers(meta)
    buffers.update(zip(names, vals))
    _write_buffers(meta, buffers)
    a, b, c = vals
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Golden for scalar carry ops; src participates twice (src op scalar + src).

    NOTE(review): the trailing `+ src` mirrors the device op where the
    destination also acts as an accumulator — confirm against the kernel.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    sc = np.float32(scalar)
    if op == "addsc":
        result = src + sc + src
    elif op == "subsc":
        result = src - sc + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Golden reducing each ROWSxCOLS row to one value (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reducers[op](src_m).astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Golden reducing columns; colsum carries a scratch buffer and can
    optionally accumulate into a pre-seeded output buffer."""
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        # Scratch buffer is written out zeroed; the kernel fills it on-device.
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        result = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            result = result + out_init
    elif op == "colmax":
        result = src_m.max(axis=0)
    elif op == "colmin":
        result = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: result.astype(np.float32)})


def generate_rowexpand_case():
    """Golden broadcasting column 0 of the source across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    first_col = _as_matrix(src)[:, :1]
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(first_col, (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})
def generate_colexpand_case():
    """Golden broadcasting row 0 of the source across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    first_row = _as_matrix(src)[:1, :]
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(first_row, (ROWS, 1))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden combining a matrix with one scalar per row taken from src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    src0_m = _as_matrix(src0)
    # Only the first ROWS entries of src1 are meaningful: one scalar per row.
    per_row = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        result = src0_m * per_row
    elif op == "rowexpanddiv":
        result = src0_m / per_row
    elif op == "rowexpandsub":
        result = src0_m - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All inputs zeroed; golden is the output buffer filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Golden for comparison ops producing a packed predicate mask.

    cmp:  src0 < src1 elementwise.
    cmps: src0 > scalar elementwise.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Golden for mask-select: out = src0 where mask bit set, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})
def generate_sels_case(select_mode: int):
    """Golden for scalar select: mode 1 passes src0 through, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Golden for integer bitwise/shift ops whose both operands are src itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift counts must stay tiny (0..3); other ops use small signed values.
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shl", "shr"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    formulas = {
        "and": lambda: np.bitwise_and(src, src),
        "or": lambda: np.bitwise_or(src, src),
        "xor": lambda: np.bitwise_xor(src, src),
        "shl": lambda: np.left_shift(src, src),
        "shr": lambda: np.right_shift(src, src),
        "not": lambda: np.bitwise_not(src),
    }
    if op not in formulas:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(formulas[op](), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Golden for integer bitwise/shift ops against a host scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shls", "shrs"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    sc = np.asarray(scalar, dtype=dtype).item()  # coerce scalar into target dtype
    formulas = {
        "ands": lambda: np.bitwise_and(src, sc),
        "ors": lambda: np.bitwise_or(src, sc),
        "xors": lambda: np.bitwise_xor(src, sc),
        "shls": lambda: np.left_shift(src, sc),
        "shrs": lambda: np.right_shift(src, sc),
    }
    if op not in formulas:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(formulas[op](), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat .bin buffers elementwise; eps is used as atol AND rtol.

    Returns True on match; prints one [ERROR] line and returns False otherwise.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen purely for reporting the largest absolute deviation.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        g = golden.astype(wide, copy=False)
        o = output.astype(wide, copy=False)
        delta = np.abs(g - o)
        worst = int(np.argmax(delta))
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(delta[worst])} at idx={worst} "
            f"(golden={g[worst]}, out={o[worst]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Max/max_golden.py b/test/samples/Max/max_golden.py new file mode 100755 index 00000000..41cf4f08 --- /dev/null +++ b/test/samples/Max/max_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Golden for tensor-vs-host-scalar ops (also hosts exp/log/sqrt/... here).

    NOTE(review): for divs/rems and log/sqrt/rsqrt/recip the source buffer is
    drawn twice from the same generator (second draw wins); kept verbatim so
    regenerated .bin files stay bit-identical.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    sc = np.float32(scalar)
    formulas = {
        "adds": lambda: src + sc,
        "subs": lambda: src - sc,
        "muls": lambda: src * sc,
        "divs": lambda: sc / src if scalar_left else src / sc,
        "maxs": lambda: np.maximum(src, sc),
        "mins": lambda: np.minimum(src, sc),
        "rems": lambda: np.fmod(src, sc),
        "lrelu": lambda: np.where(src > 0.0, src, src * sc),
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
    }
    if op not in formulas:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): formulas[op]().astype(np.float32)})


def generate_unary_float_case(op: str):
    """Golden for a one-tensor elementwise op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # domain-restricted ops get strictly positive data
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    formulas = {
        "abs": lambda: np.abs(src),
        "neg": lambda: -src,
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
        "relu": lambda: np.maximum(src, np.float32(0.0)),
    }
    if op not in formulas:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): formulas[op]().astype(np.float32)})


def generate_prelu_case():
    """PReLU golden: negatives scaled by a per-element slope tensor."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    result = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_addc_case(op: str):
    """Golden for three-operand carry ops: a (+|-) b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    names = meta.inputs
    vals = [
        _float_values(rng, meta.elem_counts[name], style=style)
        for name, style in zip(names, ("signed", "signed", "signed_small"))
    ]
    buffers = _default_buffers(meta)
    buffers.update(zip(names, vals))
    _write_buffers(meta, buffers)
    a, b, c = vals
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Golden for scalar carry ops: src op scalar, plus src again as accumulator.

    NOTE(review): the trailing `+ src` mirrors the device op where the
    destination doubles as an accumulator — confirm against the kernel.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    sc = np.float32(scalar)
    if op == "addsc":
        result = src + sc + src
    elif op == "subsc":
        result = src - sc + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Golden reducing each row of the ROWSxCOLS matrix (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reducers[op](src_m).astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Golden reducing columns; colsum carries a scratch buffer and may
    accumulate into a pre-seeded output buffer."""
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        result = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            result = result + out_init
    elif op == "colmax":
        result = src_m.max(axis=0)
    elif op == "colmin":
        result = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: result.astype(np.float32)})


def generate_rowexpand_case():
    """Golden broadcasting column 0 across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    first_col = _as_matrix(src)[:, :1]
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(first_col, (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden broadcasting row 0 across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    first_row = _as_matrix(src)[:1, :]
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(first_row, (ROWS, 1))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden combining a matrix with one scalar per row taken from src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    src0_m = _as_matrix(src0)
    # Only the first ROWS entries of src1 are meaningful: one scalar per row.
    per_row = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        result = src0_m * per_row
    elif op == "rowexpanddiv":
        result = src0_m / per_row
    elif op == "rowexpandsub":
        result = src0_m - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All inputs zeroed; golden is the output buffer filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Golden for comparisons producing a packed predicate mask
    (cmp: src0 < src1; cmps: src0 > scalar)."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Golden for mask-select: src0 where the mask bit is set, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for scalar select: mode 1 passes src0 through, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Golden for integer bitwise/shift ops whose both operands are src itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shl", "shr"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    formulas = {
        "and": lambda: np.bitwise_and(src, src),
        "or": lambda: np.bitwise_or(src, src),
        "xor": lambda: np.bitwise_xor(src, src),
        "shl": lambda: np.left_shift(src, src),
        "shr": lambda: np.right_shift(src, src),
        "not": lambda: np.bitwise_not(src),
    }
    if op not in formulas:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(formulas[op](), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Golden for integer bitwise/shift ops against a host scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shls", "shrs"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    sc = np.asarray(scalar, dtype=dtype).item()  # coerce scalar into target dtype
    formulas = {
        "ands": lambda: np.bitwise_and(src, sc),
        "ors": lambda: np.bitwise_or(src, sc),
        "xors": lambda: np.bitwise_xor(src, sc),
        "shls": lambda: np.left_shift(src, sc),
        "shrs": lambda: np.right_shift(src, sc),
    }
    if op not in formulas:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(formulas[op](), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat .bin buffers elementwise; eps is used as atol AND rtol.

    Returns True on match; prints one [ERROR] line and returns False otherwise.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen purely for reporting the largest absolute deviation.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        g = golden.astype(wide, copy=False)
        o = output.astype(wide, copy=False)
        delta = np.abs(g - o)
        worst = int(np.argmax(delta))
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(delta[worst])} at idx={worst} "
            f"(golden={g[worst]}, out={o[worst]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("max") diff --git a/test/samples/Maxs/maxs_compare.py b/test/samples/Maxs/maxs_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Maxs/maxs_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
+import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + 
rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Maxs/maxs_golden.py b/test/samples/Maxs/maxs_golden.py new file mode 100755 index 00000000..240b317f --- /dev/null +++ b/test/samples/Maxs/maxs_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + 
np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_scalar_float_case("maxs", 3.14) diff --git a/test/samples/Min/min_compare.py b/test/samples/Min/min_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Min/min_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
def generate_binary_float_case(op: str):
    """Generate inputs and golden output for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # divisors must stay away from zero
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    ops = {
        "add": lambda a, b: a + b,
        "sub": lambda a, b: a - b,
        "mul": lambda a, b: a * b,
        "div": lambda a, b: a / b,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in ops:
        raise ValueError(f"unsupported binary float op: {op}")
    out = ops[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate a scalar-operand (or scalar-parameterized unary) float case.

    scalar_left only matters for "divs": True computes scalar / src.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    positive_ops = {"log", "sqrt", "rsqrt", "recip"}
    if op == "divs2":
        first_style = "nonzero_signed"
    elif op == "exp":
        first_style = "exp"
    elif op == "cmps":
        first_style = "cmp"
    elif op in positive_ops:
        first_style = "positive"
    else:
        first_style = "signed"
    src = _float_values(rng, count, style=first_style)
    # NOTE(review): for divs/rems and the log-family ops the source buffer is
    # drawn twice and the first draw discarded.  This advances the RNG stream;
    # it is preserved verbatim so generated vectors stay bit-identical.
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    elif op in positive_ops:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    ops = {
        "adds": lambda x: x + s,
        "subs": lambda x: x - s,
        "muls": lambda x: x * s,
        "divs": lambda x: s / x if scalar_left else x / s,
        "maxs": lambda x: np.maximum(x, s),
        "mins": lambda x: np.minimum(x, s),
        "rems": lambda x: np.fmod(x, s),
        "lrelu": lambda x: np.where(x > 0.0, x, x * s),
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda x: 1.0 / np.sqrt(x),
        "recip": lambda x: 1.0 / x,
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    out = ops[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate input and golden output for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    ops = {
        "abs": np.abs,
        "neg": lambda x: -x,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda x: 1.0 / np.sqrt(x),
        "recip": lambda x: 1.0 / x,
        "relu": lambda x: np.maximum(x, np.float32(0.0)),
    }
    if op not in ops:
        raise ValueError(f"unsupported unary float op: {op}")
    out = ops[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate PReLU inputs (signal plus per-element slope) and golden output."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers.update({src_name: src, slope_name: slope})
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate three-operand cases: addc = a + b + c, subc = a - b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers.update({a_name: a, b_name: b, c_name: c})
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = a + b + c
    elif op == "subc":
        golden = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate addsc (src + scalar + src) / subsc (src - scalar + src) cases."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "addsc":
        golden = src + s + src
    elif op == "subsc":
        golden = src - s + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Reduce each matrix row to one value (sum/max/min along axis 1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    golden = reducers[op](mat)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Reduce each matrix column (sum/max/min along axis 0).

    colsum cases declare an extra scratch input buffer; with accumulate=True
    the output buffer is pre-seeded and added into the colsum golden.
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = mat.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = mat.max(axis=0)
    elif op == "colmin":
        golden = mat.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast column 0 of the source matrix across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(mat[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast row 0 of the source matrix across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(mat[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Combine src0 with a per-row scalar (first ROWS values of src1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    mat0 = _as_matrix(src0)
    mat1 = _as_matrix(src1)
    # one scalar per row, taken from src1 in row-major order
    per_row = mat1.reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers.update({src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        golden = mat0 * per_row
    elif op == "rowexpanddiv":
        golden = mat0 / per_row
    elif op == "rowexpandsub":
        golden = mat0 - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All inputs zero; the golden output is the scalar broadcast everywhere."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})
def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate a packed predicate-mask case: cmp is src0 < src1, cmps is src > scalar."""
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    # the output buffer must split evenly into per-row mask storage
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    packed = pack_predicate_mask(pred, storage_cols=meta.elem_counts[out_name] // ROWS)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Select src0 where the packed mask bit is set, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers.update({mask_name: mask, src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """select_mode == 1 picks src0 wholesale, anything else picks src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers.update({src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    golden = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise/shift op applied with the source as both operands (src OP src)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    binary_ops = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        golden = np.bitwise_not(src)
    elif op in binary_ops:
        golden = binary_ops[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(golden, dtype=dtype)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Bitwise/shift op between the source buffer and a scalar immediate."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # normalize the immediate to the target dtype's value range
    imm = np.asarray(scalar, dtype=dtype).item()
    ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    golden = ops[op](src, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(golden, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary dumps elementwise with atol=rtol=eps.

    Returns True on match; prints an [ERROR] diagnostic (including the
    worst-offending element) and returns False on any missing file, shape
    mismatch, or value mismatch.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before subtracting so the diff itself cannot overflow.
        # np.integer already covers unsigned dtypes, so one check suffices
        # (the previous extra np.unsignedinteger branch was unreachable).
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful bytes of two packed predicate-mask dumps.

    Each row stores cols bytes, but only the first _packed_row_bytes(cols)
    of them carry predicate bits; trailing padding bytes are ignored.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    diff = np.nonzero(golden_sel != output_sel)[0]
    idx = int(diff[0]) if diff.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden_<name>.bin twin."""
    meta = load_case_meta()
    # materialize the list so every output is compared (no short-circuit)
    ok = all([compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) for name in meta.outputs])
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask."""
    meta = load_case_meta()
    ok = all([compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) for name in meta.outputs])
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Report the verdict; exits(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-4)
re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, 
cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, 
cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate a scalar-operand (or scalar-parameterized unary) float case.

    scalar_left only matters for "divs": True computes scalar / src.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    positive_ops = {"log", "sqrt", "rsqrt", "recip"}
    if op == "divs2":
        first_style = "nonzero_signed"
    elif op == "exp":
        first_style = "exp"
    elif op == "cmps":
        first_style = "cmp"
    elif op in positive_ops:
        first_style = "positive"
    else:
        first_style = "signed"
    src = _float_values(rng, count, style=first_style)
    # NOTE(review): for divs/rems and the log-family ops the source buffer is
    # drawn twice and the first draw discarded.  This advances the RNG stream;
    # it is preserved verbatim so generated vectors stay bit-identical.
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    elif op in positive_ops:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    ops = {
        "adds": lambda x: x + s,
        "subs": lambda x: x - s,
        "muls": lambda x: x * s,
        "divs": lambda x: s / x if scalar_left else x / s,
        "maxs": lambda x: np.maximum(x, s),
        "mins": lambda x: np.minimum(x, s),
        "rems": lambda x: np.fmod(x, s),
        "lrelu": lambda x: np.where(x > 0.0, x, x * s),
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda x: 1.0 / np.sqrt(x),
        "recip": lambda x: 1.0 / x,
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    out = ops[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate input and golden output for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    ops = {
        "abs": np.abs,
        "neg": lambda x: -x,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda x: 1.0 / np.sqrt(x),
        "recip": lambda x: 1.0 / x,
        "relu": lambda x: np.maximum(x, np.float32(0.0)),
    }
    if op not in ops:
        raise ValueError(f"unsupported unary float op: {op}")
    out = ops[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate PReLU inputs (signal plus per-element slope) and golden output."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers.update({src_name: src, slope_name: slope})
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate three-operand cases: addc = a + b + c, subc = a - b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers.update({a_name: a, b_name: b, c_name: c})
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = a + b + c
    elif op == "subc":
        golden = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_scalar_carry_case(op: str, scalar: float):
    """Generate addsc (src + scalar + src) / subsc (src - scalar + src) cases."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "addsc":
        golden = src + s + src
    elif op == "subsc":
        golden = src - s + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Reduce each matrix row to one value (sum/max/min along axis 1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    golden = reducers[op](mat)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Reduce each matrix column (sum/max/min along axis 0).

    colsum cases declare an extra scratch input buffer; with accumulate=True
    the output buffer is pre-seeded and added into the colsum golden.
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = mat.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = mat.max(axis=0)
    elif op == "colmin":
        golden = mat.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast column 0 of the source matrix across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(mat[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast row 0 of the source matrix across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(mat[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_rowexpand_bin_case(op: str):
    """Combine src0 with a per-row scalar (first ROWS values of src1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    mat0 = _as_matrix(src0)
    mat1 = _as_matrix(src1)
    # one scalar per row, taken from src1 in row-major order
    per_row = mat1.reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers.update({src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        golden = mat0 * per_row
    elif op == "rowexpanddiv":
        golden = mat0 / per_row
    elif op == "rowexpandsub":
        golden = mat0 - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All inputs zero; the golden output is the scalar broadcast everywhere."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate a packed predicate-mask case: cmp is src0 < src1, cmps is src > scalar."""
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    # the output buffer must split evenly into per-row mask storage
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    packed = pack_predicate_mask(pred, storage_cols=meta.elem_counts[out_name] // ROWS)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Select src0 where the packed mask bit is set, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers.update({mask_name: mask, src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """select_mode == 1 picks src0 wholesale, anything else picks src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers.update({src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    golden = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a golden .bin file against an output .bin file elementwise.

    Both files are read as flat arrays of ``dtype`` and must agree to within
    ``eps`` (used as both atol and rtol; NaNs compare equal).  Returns True on
    match, False otherwise; diagnostics are printed, never raised.

    Fix vs. the original: the integer branch tested
    ``np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger)``,
    but ``np.unsignedinteger`` is a subclass of ``np.integer``, so the second
    test was dead code — collapsed to a single check.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Promote before subtracting so the reported diff is meaningful
        # for the dtypes this harness uses.  NOTE(review): int64 diffs can
        # still wrap for extreme uint64 values — display-only path.
        if np.issubdtype(dtype_np, np.integer):
            golden_cmp = golden.astype(np.int64, copy=False)
            output_cmp = output.astype(np.int64, copy=False)
        else:
            golden_cmp = golden.astype(np.float64, copy=False)
            output_cmp = output.astype(np.float64, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
# Mapping from host-side C/ACL type names (as spelled in main.cpp) to the
# numpy dtypes used for .bin round-tripping.  bfloat16 has no native numpy
# dtype, so its raw 16-bit pattern travels as uint16.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Buffer metadata scraped from a generated main.cpp."""

    elem_counts: Dict[str, int]    # buffer name -> declared element count
    np_types: Dict[str, np.dtype]  # buffer name -> numpy dtype
    read_order: List[str]          # names in the order main.cpp ReadFile()s them
    outputs: List[str]             # names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Buffers main.cpp reads that are not declared outputs."""
        return [n for n in self.read_order if n not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Resolve a host type token to its numpy dtype, raising KeyError on unknowns."""
    key = host_type.strip()
    if key not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {key}")
    return np.dtype(_HOST_TYPE_TO_NP[key])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Scrape elemCount_*/fileSize_* declarations and ReadFile order from main.cpp.

    outputs.txt (one buffer name per line) marks which buffers are outputs;
    a missing file means no outputs are declared.
    """
    text = Path(main_cpp).read_text(encoding="utf-8")
    elem_counts: Dict[str, int] = {}
    for m in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text):
        elem_counts[m.group(1)] = int(m.group(2))
    np_types: Dict[str, np.dtype] = {}
    # fileSize_<name> = elemCount_<name> * sizeof(<type>) ties each buffer to its C type.
    for m in re.finditer(r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", text):
        np_types[m.group(1)] = _host_type_to_np(m.group(2))
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs: List[str] = []
    if Path(outputs_txt).is_file():
        raw = Path(outputs_txt).read_text(encoding="utf-8")
        outputs = [ln.strip() for ln in raw.splitlines() if ln.strip()]
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)
def _packed_row_bytes(cols: int) -> int:
    """Bytes needed for one packed mask row: one 64-bit word per 64 columns, rounded up."""
    n_words = (cols + 63) // 64
    return n_words * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into per-row little-endian 64-bit words.

    Each row occupies ``storage_cols`` bytes; bit ``i`` of word ``w`` covers
    column ``w*64 + i``.  Bytes beyond the packed words stay zero.  Returns a
    flat uint8 array of length rows * storage_cols.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = np.zeros((rows, storage_cols), dtype=np.uint8)
    for r in range(rows):
        for w, start in enumerate(range(0, cols, 64)):
            word = 0
            for offset in range(min(64, cols - start)):
                word |= int(bits[r, start + offset]) << offset
            packed[r, w * 8:(w + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return packed.reshape(-1)
def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zero buffer matching `name`'s declared element count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled placeholder for every buffer main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write every buffer, in main.cpp's read order, to <name>.bin, validating sizes."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        flat = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write golden_<name>.bin for every declared output, validating sizes."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        flat = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} golden elements, got {flat.size}")
        flat.tofile(f"golden_{name}.bin")
def generate_binary_float_case(op: str):
    """Write two float32 inputs and the elementwise binary-op golden.

    The rhs is drawn away from zero for div/rem so quotients stay finite.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    binary_ops = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.true_divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in binary_ops:
        raise ValueError(f"unsupported binary float op: {op}")
    out = binary_ops[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write one float32 input and the golden for an op with a scalar operand.

    ``scalar_left`` flips divs to scalar / src.

    Fix vs. the original: the input batch is drawn exactly once.  The original
    drew and discarded up to two extra RNG batches for divs/rems and for
    log/sqrt/rsqrt/recip (dead stores), so the concrete random values for those
    ops change here — but the input .bin and golden .bin are still derived from
    the same batch, so every case remains self-consistent.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"        # these ops need strictly positive arguments
    elif op == "exp":
        style = "exp"
    elif op == "cmps":
        style = "cmp"
    elif op == "divs2":
        style = "nonzero_signed"  # divisor-style input, kept away from zero
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Write one float32 input and the golden for a pure unary op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    unary_ops = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda v: 1.0 / np.sqrt(v),
        "recip": lambda v: 1.0 / v,
        "relu": lambda v: np.maximum(v, np.float32(0.0)),
    }
    if op not in unary_ops:
        raise ValueError(f"unsupported unary float op: {op}")
    out = unary_ops[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_addc_case(op: str):
    """Three float inputs a, b and a small carry c; golden is a ± b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    names = meta.inputs
    styles = ("signed", "signed", "signed_small")
    vals = [_float_values(rng, meta.elem_counts[n], style=s) for n, s in zip(names, styles)]
    buffers = _default_buffers(meta)
    buffers.update(zip(names, vals))
    _write_buffers(meta, buffers)
    a, b, c = vals
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """One input; golden is src ± scalar + src (scalar op with self-carry)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Reduce each matrix row (sum/max/min along axis 1)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    out = reducers[op](src_m)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Reduce each matrix column; colsum cases carry an extra scratch buffer.

    With accumulate=True the output buffer is pre-seeded with small values and
    the colsum golden adds onto that seed.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})
def generate_colexpand_case():
    """Broadcast the first row of the input matrix down all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    first_row = _as_matrix(src)[0, :]
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.tile(first_row, (ROWS, 1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Combine each row of src0 with a per-row scalar taken from src1.

    The first ROWS elements of src1 (flattened) act as the row scalars;
    for rowexpanddiv they are drawn away from zero.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    lhs = _as_matrix(src0)
    scalars = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = lhs * scalars
    elif op == "rowexpanddiv":
        out = lhs / scalars
    elif op == "rowexpandsub":
        out = lhs - scalars
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
def generate_sels_case(select_mode: int):
    """Two inputs; golden is src0 when select_mode == 1, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Apply a bitwise/shift op with the input as both operands ("not" is unary).

    Shift cases use small non-negative shift amounts (style "shift_small").
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    self_ops = {
        "and": lambda v: v & v,
        "or": lambda v: v | v,
        "xor": lambda v: v ^ v,
        "shl": lambda v: v << v,
        "shr": lambda v: v >> v,
        "not": lambda v: ~v,
    }
    if op not in self_ops:
        raise ValueError(f"unsupported bitwise op: {op}")
    out = self_ops[op](src)
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
def _packed_row_bytes(cols: int) -> int:
    """Bytes needed for one packed mask row: one 64-bit word per 64 columns, rounded up."""
    return ((cols + 63) // 64) * 8


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate-mask .bin files byte-wise.

    Only the meaningful leading bytes of each row (the packed 64-bit words)
    are compared; any padding bytes past them are ignored.  Returns True on
    match, False otherwise; diagnostics are printed, never raised.
    """
    for label, path in (("Output", output_path), ("Golden", golden_path)):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if min(golden.size, output.size) < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False
def finalize_compare(ok: bool):
    """Report the overall compare verdict.

    On failure, exits the process with status 2 unless the COMPARE_STRICT
    environment variable is set to "0" (non-gating mode).  Returns ok.
    """
    if ok:
        print("[INFO] compare passed")
        return True
    gating = os.getenv("COMPARE_STRICT", "1") != "0"
    if gating:
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_scalar_float_case("mins", 3.14) diff --git a/test/samples/Mul/mul_compare.py b/test/samples/Mul/mul_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Mul/mul_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
+import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + 
rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Mul/mul_golden.py b/test/samples/Mul/mul_golden.py new file mode 100755 index 00000000..5af50d07 --- /dev/null +++ b/test/samples/Mul/mul_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: 
Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("mul") diff --git a/test/samples/Muls/muls_compare.py b/test/samples/Muls/muls_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Muls/muls_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Muls/muls_golden.py b/test/samples/Muls/muls_golden.py new file mode 100755 index 00000000..bc3212a8 --- /dev/null +++ b/test/samples/Muls/muls_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
import os  # FIX(review): used by compare_bin()/finalize_compare() below; safe to re-import if already imported above this chunk
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


# Fixed seed: the golden generator and the on-host compare script must draw
# byte-identical pseudo-random data.
SEED = 19
ROWS = 32
COLS = 32

# Host C/C++ scalar type names (as spelled in the generated main.cpp) -> numpy
# dtypes. bfloat16_t has no native numpy dtype, so its raw 16-bit pattern is
# carried as uint16.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Buffer metadata scraped from a test case's main.cpp / outputs.txt."""

    elem_counts: Dict[str, int]    # buffer name -> element count
    np_types: Dict[str, np.dtype]  # buffer name -> numpy dtype
    read_order: List[str]          # order in which main.cpp ReadFile()s buffers
    outputs: List[str]             # buffer names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Buffers the kernel reads that are not also declared outputs."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host scalar type name to a numpy dtype; KeyError for unknown names."""
    host_type = host_type.strip()
    if host_type not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {host_type}")
    return np.dtype(_HOST_TYPE_TO_NP[host_type])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Parse buffer names, counts, dtypes and the output set from main.cpp.

    Relies on the generated main.cpp declaring, per buffer ``name``::

        size_t elemCount_name = <N>;
        size_t fileSize_name = elemCount_name * sizeof(<host type>);
        ReadFile("./name.bin", ...);

    ``outputs_txt`` lists one output buffer name per line and is optional;
    when absent, no buffer is treated as an output.
    """
    text = Path(main_cpp).read_text(encoding="utf-8")
    elem_counts = {
        match.group(1): int(match.group(2))
        for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text)
    }
    np_types = {
        match.group(1): _host_type_to_np(match.group(2))
        for match in re.finditer(
            r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
            text,
        )
    }
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    if Path(outputs_txt).is_file():
        outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()]
    else:
        outputs = []
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)


def _rng():
    """Fresh deterministic generator; every case restarts from the same SEED."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Reshape a flat buffer to (rows, cols), validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw ``count`` float32 values in a named distribution style.

    "nonzero_signed" clamps magnitudes below 0.25 away from zero so the values
    are safe divisors; "exp" and "cmp" share the same range but are kept as
    distinct names so callers document intent.
    """
    if style == "signed":
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    elif style == "signed_small":
        arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    elif style == "nonzero_signed":
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        mask = np.abs(arr) < np.float32(0.25)
        arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25))
    elif style == "positive":
        arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    elif style == "exp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    elif style == "cmp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    else:
        raise ValueError(f"unsupported float style: {style}")
    return arr


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw ``count`` integers of ``dtype`` in a named style.

    "bitwise" draws small signed values for logical ops; "shift_small" draws
    0..3 so shift amounts stay valid for any supported width.
    """
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes per row of a packed predicate mask: one 64-bit word per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into little-endian 64-bit words, one row at a time.

    ``storage_cols`` is the per-row byte stride of the destination buffer; bytes
    past the packed words are left zero.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            width = min(64, cols - base_col)
            word = 0
            for bit_idx in range(width):
                if bits[row, base_col + bit_idx]:
                    word |= 1 << bit_idx
            out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return out.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover a (rows, cols) boolean mask.

    The per-row byte stride is inferred from the buffer size, so padded
    storage layouts round-trip correctly.
    """
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            width = min(64, cols - base_col)
            for bit_idx in range(width):
                bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zeros buffer matching ``name``'s declared count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every name main.cpp reads (outputs included)."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write every buffer main.cpp reads as ``<name>.bin``, validating sizes."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write the expected result for each declared output as ``golden_<name>.bin``."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's sole output name; raises if there is not exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs and golden for elementwise binary float ops (add/sub/mul/div/max/min/rem)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors must stay away from zero for div/rem.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate inputs and golden for scalar-operand and unary float ops.

    op: adds/subs/muls/divs/divs2/maxs/mins/rems/lrelu/exp/log/sqrt/rsqrt/recip.
    scalar: the immediate operand (ignored by the pure unary ops).
    scalar_left: for "divs", compute scalar / src instead of src / scalar.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    # NOTE(review): the re-draws below are redundant style-wise, but each call
    # advances the RNG stream; removing them would change every generated .bin
    # for these ops, so the exact draw sequence is preserved.
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "divs2":
        # FIX: "divs2" selected the nonzero_signed input style above but had no
        # dispatch arm, so Divs2 cases raised ValueError. The nonzero-divisor
        # input implies the reverse divide, scalar / src.
        # TODO(review): confirm against the Divs2 kernel semantics.
        out = np.float32(scalar) / src
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate inputs and golden for pure unary float ops (abs/neg/exp/log/sqrt/rsqrt/recip/relu)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "signed"
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs and golden for PReLU: out = src if src > 0 else src * slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs and golden for three-operand carry ops: a +/- b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate inputs and golden for scalar carry ops: src +/- scalar + src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate inputs and golden for per-row reductions (rowsum/rowmax/rowmin)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs and golden for per-column reductions (colsum/colmax/colmin).

    colsum cases declare an extra scratch buffer as a second non-output input;
    with ``accumulate`` the output starts from random values and the column sums
    are added onto it.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast column 0 across every column of the matrix."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast row 0 across every row of the matrix."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Apply a binary op between src0 and a per-row scalar taken from src1.

    The per-row scalars are the first ROWS elements of src1 (flat order);
    rowexpanddiv draws src1 nonzero so the divide is safe.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    src0_m = _as_matrix(src0)
    src1_m = _as_matrix(src1)
    row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = src0_m * row_scalars[:, None]
    elif op == "rowexpanddiv":
        out = src0_m / row_scalars[:, None]
    elif op == "rowexpandsub":
        out = src0_m - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for the fill op: the output is ``scalar`` everywhere; no real inputs."""
    meta = load_case_meta()
    buffers = _default_buffers(meta)
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: out})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed predicate-mask golden for compares.

    "cmp": src0 < src1 elementwise; "cmps": src0 > scalar. The output buffer
    stores the mask packed in 64-bit words; its per-row stride is derived from
    the declared element count.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    packed = pack_predicate_mask(pred, storage_cols=storage_cols)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Generate a packed mask plus two sources; golden picks src0 where the mask bit is set."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for scalar select: the whole of src0 when select_mode == 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate one integer input and a golden where both operands are that input.

    For and/or this is the identity, xor yields zeros, and shl/shr shift each
    element by its own value — that is the kernel contract being exercised
    (self-operand form), not an accident.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "and":
        out = np.bitwise_and(src, src)
    elif op == "or":
        out = np.bitwise_or(src, src)
    elif op == "xor":
        out = np.bitwise_xor(src, src)
    elif op == "shl":
        out = np.left_shift(src, src)
    elif op == "shr":
        out = np.right_shift(src, src)
    elif op == "not":
        out = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate an integer input and golden for scalar bitwise/shift ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the scalar through the target dtype so wrap-around matches device behavior.
    scalar = np.asarray(scalar, dtype=dtype).item()
    if op == "ands":
        out = np.bitwise_and(src, scalar)
    elif op == "ors":
        out = np.bitwise_or(src, scalar)
    elif op == "xors":
        out = np.bitwise_xor(src, scalar)
    elif op == "shls":
        out = np.left_shift(src, scalar)
    elif op == "shrs":
        out = np.right_shift(src, scalar)
    else:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a device output file with its golden; returns True on match.

    ``eps`` is used as both absolute and relative tolerance. On mismatch the
    worst element is reported (integer buffers are widened to int64 for the
    diff so the report cannot overflow).
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        if golden.size:
            if np.issubdtype(dtype_np, np.floating):
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
                golden_cmp = golden.astype(np.int64, copy=False)
                output_cmp = output.astype(np.int64, copy=False)
            else:
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            abs_diff = np.abs(golden_cmp - output_cmp)
            idx = int(np.argmax(abs_diff))
            diff = float(abs_diff[idx])
            print(
                f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
                f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
            )
        else:
            print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    return True


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Byte-compare the meaningful prefix of each row of two packed mask files.

    Only the first min(packed_row_bytes, cols) bytes per row are compared;
    bytes beyond that are treated as don't-care padding.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel != output_sel)[0]
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden; gate via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared packed-mask output against its golden; gate via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Report the overall verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    strict = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if strict:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True


if __name__ == "__main__":
    generate_scalar_float_case("muls", 3.14)
np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
def generate_expands_case(scalar: float):
    """All inputs are zero-filled; the golden output is filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    filled = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: filled})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Vector (cmp: lhs < rhs) or scalar (cmps: src > scalar) compare; golden is a packed mask."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        lhs_name, rhs_name = meta.inputs
        lhs = _float_values(rng, meta.elem_counts[lhs_name], style="cmp")
        rhs = _float_values(rng, meta.elem_counts[rhs_name], style="cmp")
        pred = _as_matrix(lhs) < _as_matrix(rhs)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        lhs_name = meta.inputs[0]
        lhs = _float_values(rng, meta.elem_counts[lhs_name], style="cmp")
        rhs_name, rhs = None, None
        pred = _as_matrix(lhs) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    bufs = _default_buffers(meta)
    bufs[lhs_name] = lhs
    if rhs_name is not None:
        bufs[rhs_name] = rhs
    _write_buffers(meta, bufs)
    out_name = _single_output(meta)
    count = meta.elem_counts[out_name]
    if count % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={count}")
    packed = pack_predicate_mask(pred, storage_cols=count // ROWS)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Mask-select between two sources; the mask input is a packed predicate."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    stride = meta.elem_counts[mask_name] // ROWS
    chosen_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(chosen_bits, storage_cols=stride)
    a_vals = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b_vals = _float_values(rng, meta.elem_counts[b_name], style="signed")
    bufs = _default_buffers(meta)
    bufs[mask_name] = packed_mask
    bufs[a_name] = a_vals
    bufs[b_name] = b_vals
    _write_buffers(meta, bufs)
    picked = np.where(chosen_bits, _as_matrix(a_vals), _as_matrix(b_vals))
    _write_golden(meta, {_single_output(meta): picked.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden is the first input when select_mode == 1, else the second."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    first_name, second_name = meta.inputs
    first = _float_values(rng, meta.elem_counts[first_name], style="signed")
    second = _float_values(rng, meta.elem_counts[second_name], style="signed")
    bufs = _default_buffers(meta)
    bufs[first_name] = first
    bufs[second_name] = second
    _write_buffers(meta, bufs)
    chosen = first if int(select_mode) == 1 else second
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise/shift op applied with the source as both operands (not: one operand)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    _write_buffers(meta, bufs)
    self_ops = {
        "and": lambda v: np.bitwise_and(v, v),
        "or": lambda v: np.bitwise_or(v, v),
        "xor": lambda v: np.bitwise_xor(v, v),
        "shl": lambda v: np.left_shift(v, v),
        "shr": lambda v: np.right_shift(v, v),
        "not": np.bitwise_not,
    }
    if op not in self_ops:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(self_ops[op](src), dtype=dtype)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Bitwise/shift op between a random integer source and a fixed scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    _write_buffers(meta, bufs)
    # Wrap the scalar into the target dtype before applying the op.
    rhs = np.asarray(scalar, dtype=dtype).item()
    scalar_ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(scalar_ops[op](src, rhs), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary files elementwise; on mismatch report the worst element."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff itself cannot overflow/lose precision.
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    g_wide = golden.astype(wide, copy=False)
    o_wide = output.astype(wide, copy=False)
    gap = np.abs(g_wide - o_wide)
    worst = int(np.argmax(gap))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(gap[worst])} at idx={worst} "
        f"(golden={g_wide[worst]}, out={o_wide[worst]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful bytes of two packed predicate-mask files."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if min(golden.size, output.size) < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    g_rows = golden[:need].reshape(rows, cols)
    o_rows = output[:need].reshape(rows, cols)
    # Only the leading packed bytes of each storage row carry mask bits.
    row_bytes = min(_packed_row_bytes(cols), cols)
    g_sel = g_rows[:, :row_bytes].reshape(-1)
    o_sel = o_rows[:, :row_bytes].reshape(-1)
    if np.array_equal(g_sel, o_sel):
        return True
    bad = np.nonzero(g_sel != o_sel)[0]
    first = int(bad[0]) if bad.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={first} "
        f"(golden={int(g_sel[first])}, out={int(o_sel[first])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden file; finalize the verdict."""
    meta = load_case_meta()
    results = [
        compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared packed-mask output; finalize the verdict."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Neg/neg_golden.py b/test/samples/Neg/neg_golden.py new file mode 100755 index 00000000..d1088c95 --- /dev/null +++ b/test/samples/Neg/neg_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
def generate_binary_float_case(op: str):
    """Elementwise binary float op: draw two inputs, write them plus the golden."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors must stay away from zero.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    binary_ops = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in binary_ops:
        raise ValueError(f"unsupported binary float op: {op}")
    out = binary_ops[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Scalar/unary float op on one input buffer.

    FIX vs the previous version: the source style is now resolved exactly once
    and the source is drawn exactly once.  The old code re-drew the source for
    divs/rems and the log/sqrt/rsqrt/recip family (silently consuming extra RNG
    state for no effect on correctness) and carried dead style branches (e.g.
    "cmps", which this function cannot handle and always rejects below).
    The written inputs and golden remain mutually consistent.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"      # domain requires strictly positive samples
    elif op == "exp":
        style = "exp"           # bounded exponent range
    elif op == "divs2":
        style = "nonzero_signed"  # scalar / src: keep src away from zero
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    sc = np.float32(scalar)
    if op == "adds":
        out = src + sc
    elif op == "subs":
        out = src - sc
    elif op == "muls":
        out = src * sc
    elif op == "divs":
        # scalar_left flips the operand order: scalar / src instead of src / scalar.
        out = sc / src if scalar_left else src / sc
    elif op == "maxs":
        out = np.maximum(src, sc)
    elif op == "mins":
        out = np.minimum(src, sc)
    elif op == "rems":
        out = np.fmod(src, sc)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * sc)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Unary float op; the draw style respects the op's domain."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Parametric ReLU: out = src where src > 0, else src * slope (elementwise slope)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_addc_case(op: str):
    """Three-operand carry-style op: golden = a (+|-) b + c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    first, second, third = meta.inputs
    # Dict literal evaluates draws in source order, preserving the RNG stream.
    vals = {
        first: _float_values(rng, meta.elem_counts[first], style="signed"),
        second: _float_values(rng, meta.elem_counts[second], style="signed"),
        third: _float_values(rng, meta.elem_counts[third], style="signed_small"),
    }
    buffers = _default_buffers(meta)
    buffers.update(vals)
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = vals[first] + vals[second] + vals[third]
    elif op == "subc":
        golden = vals[first] - vals[second] + vals[third]
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """src (+|-) scalar, then the source is added once more (carry-style)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    sc = np.float32(scalar)
    if op == "addsc":
        golden = src + sc + src
    elif op == "subsc":
        golden = src - sc + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Reduce each ROWS x COLS matrix row to one value (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        reduced = mat.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        reduced = mat.max(axis=1)
    elif op == "rowmin":
        reduced = mat.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reduced.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Column reduction; colsum takes a scratch input and may accumulate into out."""
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        # Scratch buffer is written zero-filled.
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        # Output seed is drawn AFTER the source so the RNG stream matches the original.
        seed_vals = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        seed_vals = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = seed_vals
    _write_buffers(meta, buffers)
    if op == "colsum":
        reduced = mat.sum(axis=0, dtype=np.float32)
        if accumulate:
            reduced = reduced + seed_vals
    elif op == "colmax":
        reduced = mat.max(axis=0)
    elif op == "colmin":
        reduced = mat.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: reduced.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast column 0 of the source matrix across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.repeat(mat[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
def generate_sels_case(select_mode: int):
    """Golden is the first input when select_mode == 1, else the second."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    first_name, second_name = meta.inputs
    first = _float_values(rng, meta.elem_counts[first_name], style="signed")
    second = _float_values(rng, meta.elem_counts[second_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[first_name] = first
    buffers[second_name] = second
    _write_buffers(meta, buffers)
    chosen = first if int(select_mode) == 1 else second
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise/shift op applied with the source as both operands (not: one operand)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    self_ops = {
        "and": lambda v: np.bitwise_and(v, v),
        "or": lambda v: np.bitwise_or(v, v),
        "xor": lambda v: np.bitwise_xor(v, v),
        "shl": lambda v: np.left_shift(v, v),
        "shr": lambda v: np.right_shift(v, v),
        "not": np.bitwise_not,
    }
    if op not in self_ops:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(self_ops[op](src), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Bitwise/shift op between a random integer source and a fixed scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Wrap the scalar into the target dtype before applying the op.
    rhs = np.asarray(scalar, dtype=dtype).item()
    scalar_ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(scalar_ops[op](src, rhs), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary files elementwise; on mismatch report the worst element."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff itself cannot overflow/lose precision.
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    g_wide = golden.astype(wide, copy=False)
    o_wide = output.astype(wide, copy=False)
    gap = np.abs(g_wide - o_wide)
    worst = int(np.argmax(gap))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(gap[worst])} at idx={worst} "
        f"(golden={g_wide[worst]}, out={o_wide[worst]}, dtype={dtype_np})"
    )
    return False
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_unary_float_case("neg") diff --git a/test/samples/Not/not_compare.py b/test/samples/Not/not_compare.py new file mode 100755 index 00000000..780b65b1 --- /dev/null +++ b/test/samples/Not/not_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re 
+import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, 
cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, 
cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.int16, 0.0) diff --git a/test/samples/Not/not_golden.py b/test/samples/Not/not_golden.py new file mode 100755 index 00000000..805f4e31 --- /dev/null +++ b/test/samples/Not/not_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: 
Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_self_case("not", np.int16) diff --git a/test/samples/Or/or_compare.py b/test/samples/Or/or_compare.py new file mode 100755 index 00000000..780b65b1 --- /dev/null +++ b/test/samples/Or/or_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.int16, 0.0) diff --git a/test/samples/Or/or_golden.py b/test/samples/Or/or_golden.py new file mode 100755 index 00000000..257b3900 --- /dev/null +++ b/test/samples/Or/or_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re 
+import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, 
cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, 
cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_self_case("or", np.int16) diff --git a/test/samples/Ors/ors_compare.py b/test/samples/Ors/ors_compare.py new file mode 100755 index 00000000..780b65b1 --- /dev/null +++ b/test/samples/Ors/ors_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + 
np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.int16, 0.0) diff --git a/test/samples/Ors/ors_golden.py b/test/samples/Ors/ors_golden.py new file mode 100755 index 00000000..9ba60243 --- /dev/null +++ b/test/samples/Ors/ors_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """Zero-filled staging buffer sized and typed per the parsed metadata."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """One zeroed buffer per tensor, in kernel read order."""
    buffers: Dict[str, np.ndarray] = {}
    for name in meta.read_order:
        buffers[name] = _zero_buffer(meta, name)
    return buffers


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every buffer to <name>.bin, validating size and dtype."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        flat = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize each expected output to golden_<name>.bin."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        flat = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} golden elements, got {flat.size}")
        flat.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the sole declared output name, or fail loudly."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs and golden for an elementwise binary float kernel."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # divisor-like operands are kept away from zero
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    golden_fns = {
        "add": lambda: lhs + rhs,
        "sub": lambda: lhs - rhs,
        "mul": lambda: lhs * rhs,
        "div": lambda: lhs / rhs,
        "max": lambda: np.maximum(lhs, rhs),
        "min": lambda: np.minimum(lhs, rhs),
        "rem": lambda: np.fmod(lhs, rhs),
    }
    if op not in golden_fns:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden_fns[op]().astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate inputs and golden for a scalar-operand float kernel.

    NOTE(review): for divs/rems and log/sqrt/rsqrt/recip the source buffer is
    drawn twice and the second draw wins. The redundant first draw is kept on
    purpose so the RNG stream — and therefore the generated .bin bytes —
    stays identical to the paired golden/compare scripts.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]

    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")  # redundant redraw kept
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")  # redundant redraw kept

    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    golden_fns = {
        "adds": lambda: src + s,
        "subs": lambda: src - s,
        "muls": lambda: src * s,
        "divs": lambda: (s / src) if scalar_left else (src / s),
        "maxs": lambda: np.maximum(src, s),
        "mins": lambda: np.minimum(src, s),
        "rems": lambda: np.fmod(src, s),
        "lrelu": lambda: np.where(src > 0.0, src, src * s),
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
    }
    if op not in golden_fns:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden_fns[op]().astype(np.float32)})
def generate_unary_float_case(op: str):
    """Generate inputs and golden for an elementwise unary float kernel."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # log-family ops need strictly positive inputs; exp uses a tighter range
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    golden_fns = {
        "abs": lambda: np.abs(src),
        "neg": lambda: -src,
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
        "relu": lambda: np.maximum(src, np.float32(0.0)),
    }
    if op not in golden_fns:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden_fns[op]().astype(np.float32)})


def generate_prelu_case():
    """Generate inputs and golden for prelu: src where positive, src*slope otherwise."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs and golden for three-operand add/sub-with-carry cases."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = a + b + c
    elif op == "subc":
        golden = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate inputs and golden for scalar add/sub-with-carry cases.

    NOTE(review): the golden is (src op scalar) + src — the extra "+ src"
    term presumably models the kernel's carry operand; confirm against the
    device semantics.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        golden = src + np.float32(scalar) + src
    elif op == "subsc":
        golden = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate inputs and golden for per-row (axis=1) reductions."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        golden = matrix.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        golden = matrix.max(axis=1)
    elif op == "rowmin":
        golden = matrix.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs and golden for per-column (axis=0) reductions.

    colsum cases declare an extra scratch tensor; with accumulate=True the
    output buffer is pre-seeded and (for colsum) folded into the golden.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = matrix.max(axis=0)
    elif op == "colmin":
        golden = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})
def generate_rowexpand_case():
    """Golden broadcasts each row's first element across the whole row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(matrix[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden broadcasts the first row across all rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(matrix[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Apply a per-row scalar (from src1) to every element of src0's row.

    NOTE(review): the per-row scalar vector is the first ROWS elements of the
    flattened src1 buffer (i.e. row 0 when COLS >= ROWS) — confirm against
    the kernel's operand layout.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    lhs = _as_matrix(src0)
    scalars = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden_fns = {
        "rowexpandmul": lambda: lhs * scalars[:, None],
        "rowexpanddiv": lambda: lhs / scalars[:, None],
        "rowexpandsub": lambda: lhs - scalars[:, None],
    }
    if op not in golden_fns:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden_fns[op]().astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All inputs stay zeroed; the golden is a constant fill of `scalar`."""
    meta = load_case_meta()
    buffers = _default_buffers(meta)
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed predicate-mask golden for compare ops.

    NOTE(review): "cmp" uses src0 < src1 while "cmps" uses src0 > scalar —
    the asymmetry is preserved as-is; presumably it matches the device
    predicate convention for each case. Confirm against the kernels.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    mask_count = meta.elem_counts[out_name]
    if mask_count % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={mask_count}")
    packed = pack_predicate_mask(pred, storage_cols=mask_count // ROWS)
    _write_golden(meta, {out_name: packed})
def generate_sel_case():
    """Mask-driven select: golden takes src0 where the mask bit is set."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # mask bits are drawn before the data operands — keep this order so the
    # RNG stream matches the paired golden/compare scripts
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Scalar select: golden is src0 when select_mode == 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise/shift op applied with the source as both operands.

    NOTE(review): self-operand semantics make and/or identities and xor all
    zeros — presumably intentional for these single-tensor kernels; confirm.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shl", "shr"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden_fns = {
        "and": lambda: np.bitwise_and(src, src),
        "or": lambda: np.bitwise_or(src, src),
        "xor": lambda: np.bitwise_xor(src, src),
        "shl": lambda: np.left_shift(src, src),
        "shr": lambda: np.right_shift(src, src),
        "not": lambda: np.bitwise_not(src),
    }
    if op not in golden_fns:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(golden_fns[op](), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Bitwise/shift op between the source tensor and an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shls", "shrs"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # coerce the immediate through the tensor dtype (wraps like the device)
    scalar = np.asarray(scalar, dtype=dtype).item()
    golden_fns = {
        "ands": lambda: np.bitwise_and(src, scalar),
        "ors": lambda: np.bitwise_or(src, scalar),
        "xors": lambda: np.bitwise_xor(src, scalar),
        "shls": lambda: np.left_shift(src, scalar),
        "shrs": lambda: np.right_shift(src, scalar),
    }
    if op not in golden_fns:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(golden_fns[op](), dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare an output .bin against its golden within eps; return bool."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    kind = np.dtype(dtype)
    expected = np.fromfile(golden_path, dtype=kind)
    actual = np.fromfile(output_path, dtype=kind)
    if expected.shape != actual.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {expected.shape} vs {output_path} {actual.shape}")
        return False
    if np.allclose(expected, actual, atol=eps, rtol=eps, equal_nan=True):
        return True
    if expected.size:
        # widen before differencing so the diagnostic cannot overflow/lose precision
        wide = np.int64 if np.issubdtype(kind, np.integer) else np.float64
        g = expected.astype(wide, copy=False)
        o = actual.astype(wide, copy=False)
        delta = np.abs(g - o)
        worst = int(np.argmax(delta))
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(delta[worst])} at idx={worst} "
            f"(golden={g[worst]}, out={o[worst]}, dtype={kind})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={kind}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful packed-prefix bytes of a predicate mask."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # NOTE(review): the min() caps the compared prefix at cols bytes per row;
    # looks intentional (only the packed prefix carries data) — confirm.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden; gate via finalize."""
    meta = load_case_meta()
    # compare every file (no short-circuit) so all mismatches are reported
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared packed-mask output; gate via finalize."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Report the overall verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    gating = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if gating:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True


if __name__ == "__main__":
    generate_bitwise_scalar_case("ors", 88, np.int16)
+#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) 
+ + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + 
raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for 
{name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = 
_default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() 
+ if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Partadd/partadd_golden.py b/test/samples/Partadd/partadd_golden.py new file mode 100755 index 00000000..b193bfff --- /dev/null +++ b/test/samples/Partadd/partadd_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: 
Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = 
np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: 
int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def 
_single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, 
buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, 
got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("add") diff --git a/test/samples/Partmax/partmax_compare.py b/test/samples/Partmax/partmax_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Partmax/partmax_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Partmax/partmax_golden.py b/test/samples/Partmax/partmax_golden.py new file mode 100755 index 00000000..41cf4f08 --- /dev/null +++ b/test/samples/Partmax/partmax_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# 
coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def 
_as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise 
ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for 
{name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = 
_default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() 
+ if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("max") diff --git a/test/samples/Partmin/partmin_compare.py b/test/samples/Partmin/partmin_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Partmin/partmin_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: 
Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = 
np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: 
int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def 
_single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, 
buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, 
got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Partmin/partmin_golden.py b/test/samples/Partmin/partmin_golden.py new file mode 100755 index 00000000..5da825df --- /dev/null +++ b/test/samples/Partmin/partmin_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("min") diff --git a/test/samples/Prelu/prelu_compare.py b/test/samples/Prelu/prelu_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Prelu/prelu_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import 
os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: 
int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D 
array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = 
np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + 
buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 
1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Prelu/prelu_golden.py b/test/samples/Prelu/prelu_golden.py new file mode 100755 index 00000000..84dd74ff --- /dev/null +++ b/test/samples/Prelu/prelu_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + 
np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_prelu_case() diff --git a/test/samples/Recip/recip_compare.py b/test/samples/Recip/recip_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null +++ b/test/samples/Recip/recip_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
def generate_rowexpand_bin_case(op: str):
    """Golden for ops combining a matrix with a per-row scalar vector."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    rhs_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    rhs = _float_values(rng, meta.elem_counts[src1_name], style=rhs_style)
    lhs_m = _as_matrix(lhs)
    rhs_m = _as_matrix(rhs)
    # Only the first ROWS elements of the second operand act as row scalars.
    row_scalars = rhs_m.reshape(-1)[:ROWS].astype(np.float32)
    payload = _default_buffers(meta)
    payload[src0_name] = lhs
    payload[src1_name] = rhs
    _write_buffers(meta, payload)
    column = row_scalars[:, None]
    if op == "rowexpandmul":
        result = lhs_m * column
    elif op == "rowexpanddiv":
        result = lhs_m / column
    elif op == "rowexpandsub":
        result = lhs_m - column
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for the expand-scalar op: output is a constant-filled buffer."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    filled = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: filled})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Golden for compare ops; the result is a packed per-row predicate mask."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name, src1_name = meta.inputs[0], None
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    payload = _default_buffers(meta)
    payload[src0_name] = src0
    if src1_name is not None and src1 is not None:
        payload[src1_name] = src1
    _write_buffers(meta, payload)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Golden for sel: a packed mask chooses between two sources elementwise."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    payload = _default_buffers(meta)
    payload[mask_name] = packed_mask
    payload[src0_name] = src0
    payload[src1_name] = src1
    _write_buffers(meta, payload)
    chosen = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for sels: select_mode == 1 picks the first source, else the second."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    payload = _default_buffers(meta)
    payload[src0_name] = src0
    payload[src1_name] = src1
    _write_buffers(meta, payload)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Golden for bitwise/shift ops applied with the source as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    source = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    table = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(source)
    elif op in table:
        result = table[op](source, source)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Golden for bitwise/shift ops with an immediate scalar operand."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    source = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    # Wrap the scalar into the target dtype so immediates match device width.
    imm = np.asarray(scalar, dtype=dtype).item()
    table = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = table[op](source, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a produced .bin against its golden within atol/rtol == eps."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Promote to a wide common type so the reported diff cannot overflow.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful packed-mask bytes of two .bin files."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # Only the leading row_bytes of each storage row carry mask bits.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Recip/recip_golden.py b/test/samples/Recip/recip_golden.py new file mode 100755 index 00000000..dbc359d1 --- /dev/null +++ b/test/samples/Recip/recip_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import 
os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: 
int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D 
array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = 
def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write golden_<name>.bin for every declared output, validating sizes."""
    for name in meta.outputs:
        try:
            raw = outputs[name]
        except KeyError:
            raise KeyError(f"missing golden for {name}") from None
        arr = np.asarray(raw, dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """The case's sole output name; raises if there isn't exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Inputs and golden for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors are kept away from zero for division-like ops.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    payload = _default_buffers(meta)
    payload[lhs_name] = lhs
    payload[rhs_name] = rhs
    _write_buffers(meta, payload)
    table = {
        "add": lambda a, b: a + b,
        "sub": lambda a, b: a - b,
        "mul": lambda a, b: a * b,
        "div": lambda a, b: a / b,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in table:
        raise ValueError(f"unsupported binary float op: {op}")
    result = table[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Inputs and golden for scalar/unary float ops driven by one source buffer.

    For `divs` with scalar_left=True the golden is scalar / src.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    count = meta.elem_counts[src_name]
    if op == "divs2":
        draw_style = "nonzero_signed"
    elif op == "exp":
        draw_style = "exp"
    elif op == "cmps":
        draw_style = "cmp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        draw_style = "positive"
    else:
        draw_style = "signed"
    source = _float_values(rng, count, style=draw_style)
    if op in {"divs", "rems", "log", "sqrt", "rsqrt", "recip"}:
        # The historical generator drew these ops twice; keep the second draw
        # so the RNG stream (and every existing golden file) stays unchanged.
        source = _float_values(rng, count, style=draw_style)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    # NOTE(review): "divs2" tailors the input draw above but has no golden
    # branch below, so it falls through to the ValueError — confirm how the
    # Divs2 sample invokes this helper.
    if op == "adds":
        result = source + np.float32(scalar)
    elif op == "subs":
        result = source - np.float32(scalar)
    elif op == "muls":
        result = source * np.float32(scalar)
    elif op == "divs":
        result = np.float32(scalar) / source if scalar_left else source / np.float32(scalar)
    elif op == "maxs":
        result = np.maximum(source, np.float32(scalar))
    elif op == "mins":
        result = np.minimum(source, np.float32(scalar))
    elif op == "rems":
        result = np.fmod(source, np.float32(scalar))
    elif op == "lrelu":
        result = np.where(source > 0.0, source, source * np.float32(scalar))
    elif op == "exp":
        result = np.exp(source)
    elif op == "log":
        result = np.log(source)
    elif op == "sqrt":
        result = np.sqrt(source)
    elif op == "rsqrt":
        result = 1.0 / np.sqrt(source)
    elif op == "recip":
        result = 1.0 / source
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Inputs and golden for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    if op in {"exp"}:
        draw_style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        draw_style = "positive"
    else:
        draw_style = "signed"
    source = _float_values(rng, meta.elem_counts[src_name], style=draw_style)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    table = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda v: 1.0 / np.sqrt(v),
        "recip": lambda v: 1.0 / v,
        "relu": lambda v: np.maximum(v, np.float32(0.0)),
    }
    if op not in table:
        raise ValueError(f"unsupported unary float op: {op}")
    result = table[op](source)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})
def generate_prelu_case():
    """Inputs and golden for prelu: negative lanes scaled by a per-lane slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    source = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    payload = _default_buffers(meta)
    payload[src_name] = source
    payload[slope_name] = slope
    _write_buffers(meta, payload)
    result = np.where(source > 0.0, source, source * slope)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_addc_case(op: str):
    """Inputs and golden for three-operand add/sub-with-carry ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    payload = _default_buffers(meta)
    payload.update({a_name: a, b_name: b, c_name: c})
    _write_buffers(meta, payload)
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Write inputs and golden for scalar carry ops that fold the source back in."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    source = _float_values(rng, meta.elem_counts[src_name], style="signed")
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    # NOTE(review): the trailing "+ source" term mirrors the device op, which
    # appears to accumulate the source on top of the scalar result — confirm
    # against the kernel definition.
    if op == "addsc":
        result = source + np.float32(scalar) + source
    elif op == "subsc":
        result = source - np.float32(scalar) + source
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Write inputs and golden for a per-row reduction (rowsum/rowmax/rowmin)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    source = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(source)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    result = reducers[op](matrix)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Write inputs and golden for a per-column reduction (colsum/colmax/colmin).

    colsum cases declare an extra scratch input buffer; with accumulate=True
    the output buffer is pre-seeded and added into the column sums.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    source = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(source)
    payload = _default_buffers(meta)
    payload[src_name] = source
    if tmp_name is not None:
        payload[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    # Seed for the accumulator is drawn after the source so the RNG stream
    # matches the historical generator.
    if accumulate:
        seed_out = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        seed_out = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    payload[out_name] = seed_out
    _write_buffers(meta, payload)
    if op == "colsum":
        result = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            result = result + seed_out
    elif op == "colmax":
        result = matrix.max(axis=0)
    elif op == "colmin":
        result = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: result.astype(np.float32)})
def generate_rowexpand_case():
    """Broadcast column 0 of the input across all columns for the golden."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    source = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(source)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    expanded = np.repeat(matrix[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast row 0 of the input across all rows for the golden."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    source = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(source)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    expanded = np.repeat(matrix[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden for ops combining a matrix with a per-row scalar vector."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    rhs_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    rhs = _float_values(rng, meta.elem_counts[src1_name], style=rhs_style)
    lhs_m = _as_matrix(lhs)
    rhs_m = _as_matrix(rhs)
    # Only the first ROWS elements of the second operand act as row scalars.
    row_scalars = rhs_m.reshape(-1)[:ROWS].astype(np.float32)
    payload = _default_buffers(meta)
    payload[src0_name] = lhs
    payload[src1_name] = rhs
    _write_buffers(meta, payload)
    column = row_scalars[:, None]
    if op == "rowexpandmul":
        result = lhs_m * column
    elif op == "rowexpanddiv":
        result = lhs_m / column
    elif op == "rowexpandsub":
        result = lhs_m - column
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for the expand-scalar op: output is a constant-filled buffer."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    filled = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: filled})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Golden for compare ops; the result is a packed per-row predicate mask."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name, src1_name = meta.inputs[0], None
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    payload = _default_buffers(meta)
    payload[src0_name] = src0
    if src1_name is not None and src1 is not None:
        payload[src1_name] = src1
    _write_buffers(meta, payload)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})
def generate_sel_case():
    """Golden for sel: a packed mask chooses between two sources elementwise."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    payload = _default_buffers(meta)
    payload[mask_name] = packed_mask
    payload[src0_name] = src0
    payload[src1_name] = src1
    _write_buffers(meta, payload)
    chosen = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for sels: select_mode == 1 picks the first source, else the second."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    payload = _default_buffers(meta)
    payload[src0_name] = src0
    payload[src1_name] = src1
    _write_buffers(meta, payload)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Golden for bitwise/shift ops applied with the source as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    source = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    table = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(source)
    elif op in table:
        result = table[op](source, source)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Golden for bitwise/shift ops with an immediate scalar operand."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    source = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    payload = _default_buffers(meta)
    payload[src_name] = source
    _write_buffers(meta, payload)
    # Wrap the scalar into the target dtype so immediates match device width.
    imm = np.asarray(scalar, dtype=dtype).item()
    table = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = table[op](source, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a produced .bin against its golden within atol/rtol == eps."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Promote to a wide common type so the reported diff cannot overflow.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_unary_float_case("recip") diff --git a/test/samples/Relu/relu_compare.py b/test/samples/Relu/relu_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Relu/relu_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + 
np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Relu/relu_golden.py b/test/samples/Relu/relu_golden.py new file mode 100755 index 00000000..3e014456 --- /dev/null +++ b/test/samples/Relu/relu_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_unary_float_case("relu") diff --git a/test/samples/Rem/rem_compare.py b/test/samples/Rem/rem_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null +++ b/test/samples/Rem/rem_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import 
re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, 
cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, 
cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Rem/rem_golden.py b/test/samples/Rem/rem_golden.py new file mode 100755 index 00000000..1e439e88 --- /dev/null +++ b/test/samples/Rem/rem_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: 
Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
# Deterministic generation parameters shared by all per-case scripts.
SEED = 19
ROWS = 32
COLS = 32


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask; gate via finalize_compare."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Report the aggregate compare result.

    Failure exits with status 2 unless COMPARE_STRICT=0, in which case it is
    downgraded to a warning and False is returned.
    """
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    generate_binary_float_case("rem")

# --- patch boundary: the content below belongs to test/samples/Rems/rems_compare.py ---

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np

# C/ACL host type name -> numpy dtype. bfloat16 has no numpy dtype, so its raw
# 16-bit pattern is carried as uint16.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Tensor metadata scraped from a generated main.cpp."""

    elem_counts: Dict[str, int]    # tensor name -> element count
    np_types: Dict[str, np.dtype]  # tensor name -> numpy dtype
    read_order: List[str]          # ReadFile order observed in main.cpp
    outputs: List[str]             # names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Read-order names that are not outputs (order preserved)."""
        out_set = set(self.outputs)
        return [name for name in self.read_order if name not in out_set]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host type name (as written in main.cpp) to its numpy dtype."""
    key = host_type.strip()
    if key in _HOST_TYPE_TO_NP:
        return np.dtype(_HOST_TYPE_TO_NP[key])
    raise KeyError(f"unsupported host type: {key}")


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Scrape buffer names, element counts, dtypes and IO order from main.cpp.

    outputs_txt (one tensor name per line) marks which buffers are outputs;
    a missing file means no outputs.
    """
    text = Path(main_cpp).read_text(encoding="utf-8")
    counts: Dict[str, int] = {}
    for m in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text):
        counts[m.group(1)] = int(m.group(2))
    types: Dict[str, np.dtype] = {}
    for m in re.finditer(
        r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
        text,
    ):
        types[m.group(1)] = _host_type_to_np(m.group(2))
    order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs: List[str] = []
    out_file = Path(outputs_txt)
    if out_file.is_file():
        outputs = [ln.strip() for ln in out_file.read_text(encoding="utf-8").splitlines() if ln.strip()]
    return CaseMeta(elem_counts=counts, np_types=types, read_order=order, outputs=outputs)


def _rng():
    """Fresh generator seeded with SEED so golden and compare scripts draw identical streams."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as a rows x cols matrix, validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    if flat.size != rows * cols:
        raise ValueError(f"expected {rows * cols} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples; `style` picks the range/conditioning."""
    ranges = {
        "signed": (-3.0, 3.0),
        "signed_small": (-1.5, 1.5),
        "nonzero_signed": (-3.0, 3.0),
        "positive": (0.25, 4.0),
        "exp": (-2.0, 2.0),
        "cmp": (-2.0, 2.0),
    }
    if style not in ranges:
        raise ValueError(f"unsupported float style: {style}")
    lo, hi = ranges[style]
    arr = rng.uniform(lo, hi, size=count).astype(np.float32)
    if style == "nonzero_signed":
        # Push near-zero magnitudes to +/-0.25 so divisions stay well-conditioned.
        tiny = np.abs(arr) < np.float32(0.25)
        arr[tiny] = np.where(arr[tiny] >= 0.0, np.float32(0.25), np.float32(-0.25))
    return arr
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
def generate_binary_float_case(op: str):
    """Write inputs and golden for an elementwise binary float op on two tensors."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # div/rem keep the divisor away from zero.
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    ufuncs = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,  # C-style remainder (sign follows the dividend)
    }
    if op not in ufuncs:
        raise ValueError(f"unsupported binary float op: {op}")
    golden = ufuncs[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write input and golden for tensor-vs-scalar (and some unary) float ops.

    scalar_left only affects "divs": True computes scalar / src.
    NOTE: some ops intentionally draw the source twice; the golden uses the
    LAST draw, and the extra draws keep the RNG stream identical to the
    paired per-case script — do not "simplify" them away.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    if op == "adds":
        golden = src + s
    elif op == "subs":
        golden = src - s
    elif op == "muls":
        golden = src * s
    elif op == "divs":
        golden = s / src if scalar_left else src / s
    elif op == "maxs":
        golden = np.maximum(src, s)
    elif op == "mins":
        golden = np.minimum(src, s)
    elif op == "rems":
        golden = np.fmod(src, s)
    elif op == "lrelu":
        golden = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        golden = np.exp(src)
    elif op == "log":
        golden = np.log(src)
    elif op == "sqrt":
        golden = np.sqrt(src)
    elif op == "rsqrt":
        golden = 1.0 / np.sqrt(src)
    elif op == "recip":
        golden = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Write input and golden for a pure unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the domain valid for these ops
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    direct = {"abs": np.abs, "neg": np.negative, "exp": np.exp, "log": np.log, "sqrt": np.sqrt}
    if op in direct:
        golden = direct[op](src)
    elif op == "rsqrt":
        golden = 1.0 / np.sqrt(src)
    elif op == "recip":
        golden = 1.0 / src
    elif op == "relu":
        golden = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_prelu_case():
    """Write inputs and golden for elementwise PReLU: src if src > 0 else src * slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_addc_case(op: str):
    """Write inputs and golden for three-operand add/sub-with-carry ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers.update({a_name: a, b_name: b, c_name: c})
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = a + b + c
    elif op == "subc":
        golden = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Write input and golden for scalar carry ops: src (+/-) scalar + src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "addsc":
        golden = src + s + src
    elif op == "subsc":
        golden = src - s + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Write input and golden for per-row sum/max/min of the ROWS x COLS matrix."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        golden = matrix.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        golden = matrix.max(axis=1)
    elif op == "rowmin":
        golden = matrix.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Write input and golden for per-column reductions.

    "colsum" additionally carries a scratch tensor and, with accumulate=True,
    pre-seeds the output and adds it into the golden column sums.
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    # Draw the seed only when accumulating, so the RNG stream matches the original.
    if accumulate:
        seed_out = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        seed_out = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = seed_out
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + seed_out
    elif op == "colmax":
        golden = matrix.max(axis=0)
    elif op == "colmin":
        golden = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Golden broadcasts column 0 of the input across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(matrix[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_colexpand_case():
    """Golden broadcasts row 0 of the input across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(matrix[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden combines each row of src0 with a per-row scalar taken from src1.

    The per-row scalars are the first ROWS elements of src1's flat buffer.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    lhs = _as_matrix(src0)
    per_row = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        golden = lhs * per_row
    elif op == "rowexpanddiv":
        golden = lhs / per_row
    elif op == "rowexpandsub":
        golden = lhs - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden is a constant fill with `scalar`; all read buffers stay zero."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Inputs plus packed predicate-mask golden for tensor ("cmp": src0 < src1)
    and scalar ("cmps": src0 > scalar) compares."""
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    # The output buffer's per-row byte stride is its count divided by ROWS.
    packed = pack_predicate_mask(pred, storage_cols=meta.elem_counts[out_name] // ROWS)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Packed mask plus two tensors in; golden is the elementwise select."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=meta.elem_counts[mask_name] // ROWS)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_sels_case(select_mode: int):
    """Two tensors in; golden is src0 when select_mode == 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Input and golden for bitwise/shift ops applied to the tensor with itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shl", "shr"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    two_operand = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op in two_operand:
        golden = two_operand[op](src, src)
    elif op == "not":
        golden = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(golden, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Input and golden for bitwise/shift ops against an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shls", "shrs"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the scalar through the target dtype first (matches device behavior).
    imm = np.asarray(scalar, dtype=dtype).item()
    scalar_ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    golden = scalar_ops[op](src, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(golden, dtype=dtype)})
# Tile geometry constants (restated module values so the defaults below resolve).
ROWS = 32
COLS = 32


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary buffers of `dtype` within tolerance `eps`."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dt = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dt)
    output = np.fromfile(output_path, dtype=dt)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before subtracting so the diff cannot overflow/lose precision.
        wide = np.int64 if np.issubdtype(dt, np.integer) else np.float64
        g = golden.astype(wide, copy=False)
        o = output.astype(wide, copy=False)
        gap = np.abs(g - o)
        worst = int(np.argmax(gap))
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(gap[worst])} at idx={worst} "
            f"(golden={g[worst]}, out={o[worst]}, dtype={dt})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dt}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate-mask buffers, ignoring pad bytes past each row's mask words."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # 8 mask bytes per 64 columns, clamped to the row stride.
    row_bytes = min(((cols + 63) // 64) * 8, cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    bad = np.nonzero(golden_sel != output_sel)[0]
    idx = int(bad[0]) if bad.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output buffer; gate via finalize_compare."""
    meta = load_case_meta()
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask; gate via finalize_compare."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Report the aggregate result; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-3)

# --- patch boundary: the content below belongs to test/samples/Rems/rems_golden.py ---
import os
import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np

# Deterministic generation parameters shared by all per-case scripts.
SEED = 19
ROWS = 32
COLS = 32

# C/ACL host type name -> numpy dtype. bfloat16 has no numpy dtype, so its raw
# 16-bit pattern is carried as uint16.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Tensor metadata scraped from a generated main.cpp."""

    elem_counts: Dict[str, int]    # tensor name -> element count
    np_types: Dict[str, np.dtype]  # tensor name -> numpy dtype
    read_order: List[str]          # ReadFile order observed in main.cpp
    outputs: List[str]             # names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Read-order names that are not outputs (order preserved)."""
        out_set = set(self.outputs)
        return [name for name in self.read_order if name not in out_set]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host type name (as written in main.cpp) to its numpy dtype."""
    key = host_type.strip()
    if key in _HOST_TYPE_TO_NP:
        return np.dtype(_HOST_TYPE_TO_NP[key])
    raise KeyError(f"unsupported host type: {key}")


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Scrape buffer names, element counts, dtypes and IO order from main.cpp."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    counts: Dict[str, int] = {}
    for m in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text):
        counts[m.group(1)] = int(m.group(2))
    types: Dict[str, np.dtype] = {}
    for m in re.finditer(
        r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
        text,
    ):
        types[m.group(1)] = _host_type_to_np(m.group(2))
    order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs: List[str] = []
    out_file = Path(outputs_txt)
    if out_file.is_file():
        outputs = [ln.strip() for ln in out_file.read_text(encoding="utf-8").splitlines() if ln.strip()]
    return CaseMeta(elem_counts=counts, np_types=types, read_order=order, outputs=outputs)


def _rng():
    """Fresh generator seeded with SEED so golden and compare scripts draw identical streams."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as a rows x cols matrix, validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    if flat.size != rows * cols:
        raise ValueError(f"expected {rows * cols} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples; `style` picks the range/conditioning."""
    ranges = {
        "signed": (-3.0, 3.0),
        "signed_small": (-1.5, 1.5),
        "nonzero_signed": (-3.0, 3.0),
        "positive": (0.25, 4.0),
        "exp": (-2.0, 2.0),
        "cmp": (-2.0, 2.0),
    }
    if style not in ranges:
        raise ValueError(f"unsupported float style: {style}")
    lo, hi = ranges[style]
    arr = rng.uniform(lo, hi, size=count).astype(np.float32)
    if style == "nonzero_signed":
        # Push near-zero magnitudes to +/-0.25 so divisions stay well-conditioned.
        tiny = np.abs(arr) < np.float32(0.25)
        arr[tiny] = np.where(arr[tiny] >= 0.0, np.float32(0.25), np.float32(-0.25))
    return arr


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw integer samples for bitwise/shift ops; "shift_small" stays in [0, 4)."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style != "bitwise":
            raise ValueError(f"unsupported int16 style: {style}")
        vals = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes of packed mask per row: one little-endian 64-bit word per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into per-row little-endian 64-bit words."""
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            width = min(64, cols - base_col)
            word = 0
            for offset in range(width):
                word |= int(bits[row, base_col + offset]) << offset
            out[row, 8 * word_idx:8 * (word_idx + 1)] = list(word.to_bytes(8, "little"))
    return out.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: expand a packed buffer to a rows x cols bool array."""
    flat = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if flat.size % rows != 0:
        raise ValueError(f"mask buffer size {flat.size} is not divisible by rows={rows}")
    storage_cols = flat.size // rows
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = flat.reshape(rows, storage_cols)
    out = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, 8 * word_idx:8 * (word_idx + 1)]), "little")
            for offset in range(min(64, cols - base_col)):
                out[row, base_col + offset] = ((word >> offset) & 1) != 0
    return out


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zeros buffer with `name`'s declared element count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every tensor main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every read-order buffer to <name>.bin, validating count and dtype."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        flat = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize every declared output's reference values to golden_<name>.bin."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        flat = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} golden elements, got {flat.size}")
        flat.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Name of the unique output tensor; raises unless exactly one is declared."""
    if len(meta.outputs) == 1:
        return meta.outputs[0]
    raise ValueError(f"expected exactly one output, got {meta.outputs}")


def generate_binary_float_case(op: str):
    """Write inputs and golden for an elementwise binary float op on two tensors."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # div/rem keep the divisor away from zero.
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    ufuncs = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,  # C-style remainder (sign follows the dividend)
    }
    if op not in ufuncs:
        raise ValueError(f"unsupported binary float op: {op}")
    golden = ufuncs[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write input and golden for tensor-vs-scalar (and some unary) float ops.

    NOTE: some ops intentionally draw the source twice; the golden uses the
    LAST draw, and the extra draws keep the RNG stream identical to the
    paired per-case script — do not "simplify" them away.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "adds":
        golden = src + s
    elif op == "subs":
        golden = src - s
    elif op == "muls":
        golden = src * s
    elif op == "divs":
        golden = s / src if scalar_left else src / s
    elif op == "maxs":
        golden = np.maximum(src, s)
    elif op == "mins":
        golden = np.minimum(src, s)
    elif op == "rems":
        golden = np.fmod(src, s)
    elif op == "lrelu":
        golden = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        golden = np.exp(src)
    elif op == "log":
        golden = np.log(src)
    elif op == "sqrt":
        golden = np.sqrt(src)
    elif op == "rsqrt":
        golden = 1.0 / np.sqrt(src)
    elif op == "recip":
        golden = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
def generate_row_reduce_case(op: str):
    """Generate input and golden for a per-row reduction (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        reduced = matrix.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        reduced = matrix.max(axis=1)
    elif op == "rowmin":
        reduced = matrix.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reduced.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate input and golden for a per-column reduction.

    colsum cases declare an extra scratch input buffer.  With accumulate
    the output buffer is pre-seeded with small values that are folded into
    the colsum golden.
    NOTE(review): accumulate only alters the "colsum" golden; for
    colmax/colmin it still randomizes the output seed — confirm intended.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    seed = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        seed = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = seed
    _write_buffers(meta, buffers)
    if op == "colsum":
        reduced = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            reduced = reduced + seed
    elif op == "colmax":
        reduced = matrix.max(axis=0)
    elif op == "colmin":
        reduced = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: reduced.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast each row's first element across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(matrix[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast the first row across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(matrix[:1, :], (ROWS, 1))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Row-broadcast binary op: each row of src0 combined with one per-row
    scalar taken from the head of src1 (nonzero when dividing)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    lhs = _as_matrix(src0)
    per_row = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        result = lhs * per_row
    elif op == "rowexpanddiv":
        result = lhs / per_row
    elif op == "rowexpandsub":
        result = lhs - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})
def generate_expands_case(scalar: float):
    """Golden for a fill op: every output element equals `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    filled = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: filled})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed predicate-mask golden for cmp/cmps.

    cmp:  predicate is src0 < src1 (elementwise).
    cmps: predicate is src0 > scalar.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    # The mask buffer's per-row stride is inferred from its element count.
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    packed = pack_predicate_mask(pred, storage_cols=storage_cols)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Generate a packed mask plus two sources; golden picks src0 where the
    mask bit is set, src1 elsewhere."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for scalar select: whole-buffer src0 when mode==1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise/shift op applied with the source as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shifts draw tiny non-negative values so src-shift-by-src is defined.
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "and":
        result = np.bitwise_and(src, src)
    elif op == "or":
        result = np.bitwise_or(src, src)
    elif op == "xor":
        result = np.bitwise_xor(src, src)
    elif op == "shl":
        result = np.left_shift(src, src)
    elif op == "shr":
        result = np.right_shift(src, src)
    elif op == "not":
        result = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate input and golden for a bitwise/shift op with a scalar RHS."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the scalar through the target dtype so wrapping matches device.
    scalar = np.asarray(scalar, dtype=dtype).item()
    if op == "ands":
        out = np.bitwise_and(src, scalar)
    elif op == "ors":
        out = np.bitwise_or(src, scalar)
    elif op == "xors":
        out = np.bitwise_xor(src, scalar)
    elif op == "shls":
        out = np.left_shift(src, scalar)
    elif op == "shrs":
        out = np.right_shift(src, scalar)
    else:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare one produced .bin against its golden.

    Returns True when both files exist, hold the same element count, and
    agree within atol=rtol=eps (NaNs compare equal).  On mismatch, prints
    the worst-offending index and values.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        if golden.size:
            # np.integer already covers unsigned kinds, so a single check
            # suffices (the original also tested np.unsignedinteger, which
            # was dead code).  NOTE(review): uint64 values above 2**63-1
            # would wrap in the int64 diff below — only affects the printed
            # diagnostic, not the pass/fail decision.
            if np.issubdtype(dtype_np, np.integer):
                golden_cmp = golden.astype(np.int64, copy=False)
                output_cmp = output.astype(np.int64, copy=False)
            else:
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            abs_diff = np.abs(golden_cmp - output_cmp)
            idx = int(np.argmax(abs_diff))
            diff = float(abs_diff[idx])
            print(
                f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
                f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
            )
        else:
            print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    return True


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare two packed predicate-mask buffers byte-for-byte.

    Only the meaningful leading bytes of each row (the packed 64-bit words
    that hold `cols` bits) are compared; padding bytes are ignored.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # Clamp to the row stride in case the packed width exceeds `cols` bytes.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel != output_sel)[0]
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_scalar_float_case("rems", 3.14) diff --git a/test/samples/Rowexpand/rowexpand_compare.py b/test/samples/Rowexpand/rowexpand_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Rowexpand/rowexpand_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + 
@dataclass
class CaseMeta:
    """Parsed description of one generated test case."""

    # Buffer name -> element count (from elemCount_<name> in main.cpp).
    elem_counts: Dict[str, int]
    # Buffer name -> numpy dtype (from the sizeof(...) in main.cpp).
    np_types: Dict[str, np.dtype]
    # Buffer names in the order main.cpp ReadFile()s them.
    read_order: List[str]
    # Output buffer names listed in outputs.txt.
    outputs: List[str]

    @property
    def inputs(self) -> List[str]:
        """Non-output buffers, in ReadFile order."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a C/ACL host type name to its numpy dtype; raise on unknown names."""
    host_type = host_type.strip()
    if host_type not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {host_type}")
    return np.dtype(_HOST_TYPE_TO_NP[host_type])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Scrape buffer sizes, dtypes and read order out of the case's main.cpp,
    and output names out of outputs.txt (empty list when the file is absent)."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    counts: Dict[str, int] = {}
    for m in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text):
        counts[m.group(1)] = int(m.group(2))
    types: Dict[str, np.dtype] = {}
    for m in re.finditer(
        r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
        text,
    ):
        types[m.group(1)] = _host_type_to_np(m.group(2))
    order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs: List[str] = []
    outputs_path = Path(outputs_txt)
    if outputs_path.is_file():
        outputs = [
            line.strip()
            for line in outputs_path.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
    return CaseMeta(elem_counts=counts, np_types=types, read_order=order, outputs=outputs)


def _rng():
    """Fresh deterministic generator so every case regenerates identically."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Reshape a flat buffer to (rows, cols), validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples from the distribution named by `style`."""
    if style == "signed":
        return rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    if style == "signed_small":
        return rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    if style == "nonzero_signed":
        vals = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        # Push near-zero samples out to +/-0.25 so divisions stay stable.
        near_zero = np.abs(vals) < np.float32(0.25)
        vals[near_zero] = np.where(vals[near_zero] >= 0.0, np.float32(0.25), np.float32(-0.25))
        return vals
    if style == "positive":
        return rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    if style in ("exp", "cmp"):
        return rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    raise ValueError(f"unsupported float style: {style}")


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw `count` integers of `dtype` in a range suited to `style`."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style != "bitwise":
            raise ValueError(f"unsupported int16 style: {style}")
        vals = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes needed per row when `cols` predicate bits pack into 64-bit words."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into rows of little-endian 64-bit words.

    Each row occupies `storage_cols` bytes; unused trailing bytes stay zero.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            width = min(64, cols - base_col)
            word = 0
            for bit in range(width):
                if bits[row, base_col + bit]:
                    word |= 1 << bit
            packed[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(
                word.to_bytes(8, "little"), dtype=np.uint8
            )
    return packed.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover the (rows, cols) boolean mask."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            width = min(64, cols - base_col)
            for bit in range(width):
                bits[row, base_col + bit] = ((word >> bit) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zeros buffer with the declared dtype and element count of `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every name main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write every buffer, in ReadFile order, to "<name>.bin" after size checks."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write each expected output to "golden_<name>.bin" after size checks."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")
def _single_output(meta: CaseMeta) -> str:
    """Return the case's sole output name; raise unless there is exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Inputs + golden for an elementwise binary float op (div/rem get a
    nonzero right operand)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    if op == "add":
        result = lhs + rhs
    elif op == "sub":
        result = lhs - rhs
    elif op == "mul":
        result = lhs * rhs
    elif op == "div":
        result = lhs / rhs
    elif op == "max":
        result = np.maximum(lhs, rhs)
    elif op == "min":
        result = np.minimum(lhs, rhs)
    elif op == "rem":
        result = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Input + golden for a scalar/unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    # NOTE(review): divs/rems and the log family draw twice and keep only
    # the second sample; preserved as-is so generated bytes stay identical.
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "adds":
        result = src + s
    elif op == "subs":
        result = src - s
    elif op == "muls":
        result = src * s
    elif op == "divs":
        result = s / src if scalar_left else src / s
    elif op == "maxs":
        result = np.maximum(src, s)
    elif op == "mins":
        result = np.minimum(src, s)
    elif op == "rems":
        result = np.fmod(src, s)
    elif op == "lrelu":
        result = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        result = np.exp(src)
    elif op == "log":
        result = np.log(src)
    elif op == "sqrt":
        result = np.sqrt(src)
    elif op == "rsqrt":
        result = 1.0 / np.sqrt(src)
    elif op == "recip":
        result = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Input + golden for a unary float op (abs/neg/exp/log/...)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "signed"
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "abs":
        result = np.abs(src)
    elif op == "neg":
        result = -src
    elif op == "exp":
        result = np.exp(src)
    elif op == "log":
        result = np.log(src)
    elif op == "sqrt":
        result = np.sqrt(src)
    elif op == "rsqrt":
        result = 1.0 / np.sqrt(src)
    elif op == "recip":
        result = 1.0 / src
    elif op == "relu":
        result = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_prelu_case():
    """Input/slope + golden for PReLU with elementwise slopes."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    result = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_addc_case(op: str):
    """Three inputs + golden for addc/subc (a op b, plus c)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Input + golden for addsc/subsc (src op scalar, plus src again —
    presumably the kernel accumulates into the destination; confirm)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        result = src + np.float32(scalar) + src
    elif op == "subsc":
        result = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Input + golden for a per-row reduction (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        reduced = matrix.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        reduced = matrix.max(axis=1)
    elif op == "rowmin":
        reduced = matrix.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reduced.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Input + golden for a per-column reduction (colsum declares a scratch
    buffer; accumulate pre-seeds the output and folds it into the colsum)."""
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    seed = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        seed = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = seed
    _write_buffers(meta, buffers)
    if op == "colsum":
        reduced = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            reduced = reduced + seed
    elif op == "colmax":
        reduced = matrix.max(axis=0)
    elif op == "colmin":
        reduced = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: reduced.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast each row's first element across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(matrix[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast the first row across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    expanded = np.tile(matrix[:1, :], (ROWS, 1))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Row-broadcast binary op: each src0 row combined with one per-row
    scalar taken from the head of src1 (nonzero when dividing)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    lhs = _as_matrix(src0)
    per_row = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        result = lhs * per_row
    elif op == "rowexpanddiv":
        result = lhs / per_row
    elif op == "rowexpandsub":
        result = lhs - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for a fill op: every output element equals `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    filled = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: filled})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Inputs + packed predicate-mask golden for cmp (src0 < src1) or
    cmps (src0 > scalar)."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Packed mask + two sources; golden picks src0 where the bit is set."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for scalar select: whole-buffer src0 when mode==1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise/shift op applied with the source as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "and":
        result = np.bitwise_and(src, src)
    elif op == "or":
        result = np.bitwise_or(src, src)
    elif op == "xor":
        result = np.bitwise_xor(src, src)
    elif op == "shl":
        result = np.left_shift(src, src)
    elif op == "shr":
        result = np.right_shift(src, src)
    elif op == "not":
        result = np.bitwise_not(src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Bitwise/shift op with a scalar right-hand side (coerced to dtype)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    scalar = np.asarray(scalar, dtype=dtype).item()
    if op == "ands":
        result = np.bitwise_and(src, scalar)
    elif op == "ors":
        result = np.bitwise_or(src, scalar)
    elif op == "xors":
        result = np.bitwise_xor(src, scalar)
    elif op == "shls":
        result = np.left_shift(src, scalar)
    elif op == "shrs":
        result = np.right_shift(src, scalar)
    else:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Rowexpand/rowexpand_golden.py b/test/samples/Rowexpand/rowexpand_golden.py new file mode 100755 index 00000000..38a11172 --- /dev/null +++ b/test/samples/Rowexpand/rowexpand_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_rowexpand_case() diff --git a/test/samples/Rowexpanddiv/rowexpanddiv_compare.py b/test/samples/Rowexpanddiv/rowexpanddiv_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null +++ b/test/samples/Rowexpanddiv/rowexpanddiv_compare.py @@ -0,0 +1,737 @@ 
#!/usr/bin/python3
# coding=utf-8

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


# Fixed RNG seed and matrix geometry shared by every generated test case,
# so inputs and goldens are reproducible across runs.
SEED = 19
ROWS = 32
COLS = 32

# Host-side C/C++ type name -> numpy dtype used to (de)serialize .bin buffers.
# NOTE(review): bfloat16_t is mapped to raw uint16 words, presumably because
# numpy has no native bfloat16 — values are treated as opaque bit patterns.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Per-case tensor metadata scraped from main.cpp plus outputs.txt."""

    # buffer name -> element count (from `size_t elemCount_<name> = N;`)
    elem_counts: Dict[str, int]
    # buffer name -> numpy dtype (from `fileSize_<name> = elemCount_<name> * sizeof(T);`)
    np_types: Dict[str, np.dtype]
    # buffer names in the order main.cpp reads their .bin files
    read_order: List[str]
    # buffer names declared as outputs in outputs.txt (may be empty)
    outputs: List[str]

    @property
    def inputs(self) -> List[str]:
        """Every read buffer that is not declared as an output, in read order."""
        declared_outputs = set(self.outputs)
        return [entry for entry in self.read_order if entry not in declared_outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host C type name (e.g. "float", "int32_t") onto a numpy dtype.

    Raises KeyError for any type missing from _HOST_TYPE_TO_NP.
    """
    name = host_type.strip()
    if name in _HOST_TYPE_TO_NP:
        return np.dtype(_HOST_TYPE_TO_NP[name])
    raise KeyError(f"unsupported host type: {name}")


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Build a CaseMeta by regex-scanning main.cpp and reading outputs.txt.

    Element counts come from `elemCount_<name>` declarations, dtypes from the
    matching `fileSize_<name> = elemCount_<name> * sizeof(T)` lines (the \\1
    backreference ties the two declarations to the same buffer name), and the
    read order from ReadFile("./<name>.bin", ...) calls.  outputs.txt, when
    present, lists one output buffer name per line.
    """
    source = Path(main_cpp).read_text(encoding="utf-8")

    counts: Dict[str, int] = {}
    for decl in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", source):
        counts[decl.group(1)] = int(decl.group(2))

    types: Dict[str, np.dtype] = {}
    for decl in re.finditer(
        r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
        source,
    ):
        types[decl.group(1)] = _host_type_to_np(decl.group(2))

    order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', source)

    outputs: List[str] = []
    outputs_file = Path(outputs_txt)
    if outputs_file.is_file():
        for raw_line in outputs_file.read_text(encoding="utf-8").splitlines():
            stripped = raw_line.strip()
            if stripped:
                outputs.append(stripped)

    return CaseMeta(elem_counts=counts, np_types=types, read_order=order, outputs=outputs)


def _rng():
    """Return a fresh generator seeded with SEED for reproducible data."""
    return np.random.default_rng(SEED)
+ + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + 
raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for 
{name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = 
_default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() 
+ if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Rowexpanddiv/rowexpanddiv_golden.py b/test/samples/Rowexpanddiv/rowexpanddiv_golden.py new file mode 100755 index 00000000..9810472b --- /dev/null +++ b/test/samples/Rowexpanddiv/rowexpanddiv_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class 
CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + 
arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: 
np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + 
def _single_output(meta: CaseMeta) -> str:
    """Return the name of the case's only output tensor.

    Raises:
        ValueError: the metadata declares zero or multiple outputs.
    """
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate two inputs and the float32 golden for an elementwise binary op.

    Writes `<name>.bin` for every buffer the kernel reads and
    `golden_<name>.bin` for the single output.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # div/rem need a divisor bounded away from zero so the golden stays finite.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    binary_ops = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.true_divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,  # fmod keeps the dividend's sign (C-style remainder)
    }
    if op not in binary_ops:
        raise ValueError(f"unsupported binary float op: {op}")
    out = binary_ops[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate the input and float32 golden for src-op-scalar (and unary) ops.

    `scalar_left` only affects "divs": True computes scalar / src, else src / scalar.

    Fix over the previous version: the distribution style is resolved once and
    the source is drawn once.  The old code drew the source array up to three
    times (for divs/rems and the log family), keeping only the last draw and
    silently burning RNG state.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"        # keep log/sqrt/reciprocal domains valid
    elif op == "exp":
        style = "exp"             # modest magnitudes keep exp() well-scaled
    elif op == "cmps":
        style = "cmp"
    elif op == "divs2":
        style = "nonzero_signed"  # src is the divisor: keep it away from zero
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    if op == "adds":
        out = src + s
    elif op == "subs":
        out = src - s
    elif op == "muls":
        out = src * s
    elif op == "divs":
        out = s / src if scalar_left else src / s
    elif op == "maxs":
        out = np.maximum(src, s)
    elif op == "mins":
        out = np.minimum(src, s)
    elif op == "rems":
        out = np.fmod(src, s)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate the input and float32 golden for a one-operand elementwise op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the op's domain valid
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    unary_ops = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda x: 1.0 / np.sqrt(x),
        "recip": lambda x: 1.0 / x,
        "relu": lambda x: np.maximum(x, np.float32(0.0)),
    }
    if op not in unary_ops:
        raise ValueError(f"unsupported unary float op: {op}")
    out = unary_ops[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs/golden for PReLU: src where positive, src * slope elsewhere."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs/golden for three-operand add/sub-with-carry style ops.

    addc: a + b + c;  subc: a - b + c.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op not in ("addc", "subc"):
        raise ValueError(f"unsupported carry op: {op}")
    out = a + b + c if op == "addc" else a - b + c
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate input/golden for scalar add/sub that folds the source back in.

    addsc: (src + scalar) + src;  subsc: (src - scalar) + src — presumably
    mirrors the device op's accumulate form; confirm against the kernel.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op not in ("addsc", "subsc"):
        raise ValueError(f"unsupported scalar carry op: {op}")
    s = np.float32(scalar)
    out = (src + s if op == "addsc" else src - s) + src
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs/golden for column-wise reductions (colsum/colmax/colmin).

    colsum cases also declare a scratch tensor, written as zeros.  With
    accumulate=True the output buffer is pre-seeded with small values that
    the colsum golden adds on top of.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        seed = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        seed = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = seed
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + seed
    elif op == "colmax":
        golden = matrix.max(axis=0)
    elif op == "colmin":
        golden = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})
def generate_colexpand_case():
    """Generate input/golden broadcasting row 0 of the source down every row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    top_row = _as_matrix(src)[:1, :]
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(top_row, ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Generate inputs/golden for matrix-op-(per-row scalar) operations.

    The per-row scalars come from the first ROWS elements of src1's flat
    data, i.e. its first row — presumably mirroring how the kernel loads
    them; confirm against the device implementation.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # rowexpanddiv divides by the scalars, so keep them away from zero.
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    lhs = _as_matrix(src0)
    scalars = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    column = scalars[:, None]
    if op == "rowexpandmul":
        golden = lhs * column
    elif op == "rowexpanddiv":
        golden = lhs / column
    elif op == "rowexpandsub":
        golden = lhs - column
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs plus a packed-bitmask golden for compare ops.

    cmp:  golden bit set where src0 < src1.
    cmps: golden bit set where src0 > scalar.
    The golden uses the packed 64-bit-word layout of pack_predicate_mask,
    with the per-row storage stride derived from the output element count.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name, src1 = None, None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    count = meta.elem_counts[out_name]
    if count % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={count}")
    packed = pack_predicate_mask(pred, storage_cols=count // ROWS)
    _write_golden(meta, {out_name: packed})
def generate_sels_case(select_mode: int):
    """Generate inputs/golden for scalar-select: mode 1 picks src0, anything else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate input/golden for bitwise ops applied with the source as both operands.

    and/or/xor/shl/shr use src as lhs and rhs (so and/or are identity and xor
    is zero); "not" inverts src.  Shift sources draw only small values so
    shift counts stay in range for the dtype.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=src_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    two_operand = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(src)
    elif op in two_operand:
        result = two_operand[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate input/golden for bitwise ops against an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=src_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Cast the immediate through the target dtype so the golden sees the same
    # representation the device would.
    imm = np.asarray(scalar, dtype=dtype).item()
    scalar_ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = scalar_ops[op](src, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Byte-compare two packed predicate-mask files over their meaningful bytes.

    Both files are viewed as rows x cols byte grids; only the first
    ceil(cols/64)*8 bytes of each row (capped at cols) hold packed bits, so
    trailing padding bytes are ignored.  Returns True on match, False (with
    a log line) on any problem.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # Only the packed words at the start of each row carry data; `min` guards
    # the degenerate case where cols is smaller than one packed word's bytes.
    row_bytes = min(((cols + 63) // 64) * 8, cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden; gate via finalize_compare."""
    meta = load_case_meta()
    results = [
        compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))
def finalize_compare(ok: bool):
    """Turn an aggregate compare result into logs and exit status.

    Strict mode (COMPARE_STRICT != "0", the default) exits with code 2 on
    failure so CI gates on it; otherwise a failure only logs a warning and
    returns False.
    """
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    # Standalone entry point: this sample generates the rowexpanddiv golden.
    generate_rowexpand_bin_case("rowexpanddiv")
def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Parse buffer names, sizes and dtypes out of the generated main.cpp.

    Element counts come from `size_t elemCount_<name> = N;` declarations,
    dtypes from the matching `fileSize_<name> = elemCount_<name> * sizeof(T);`
    lines, and the read order from the ReadFile("./<name>.bin") calls.
    outputs.txt (one tensor name per line) is optional; without it the case
    declares no outputs.
    """
    text = Path(main_cpp).read_text(encoding="utf-8")
    elem_counts = {
        m.group(1): int(m.group(2))
        for m in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text)
    }
    np_types = {
        m.group(1): _host_type_to_np(m.group(2))
        for m in re.finditer(
            r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
            text,
        )
    }
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs_file = Path(outputs_txt)
    outputs: List[str] = []
    if outputs_file.is_file():
        outputs = [
            line.strip()
            for line in outputs_file.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)


def _rng():
    """Fresh deterministic generator so golden and compare scripts agree."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as a (rows, cols) matrix, validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    if flat.size != rows * cols:
        raise ValueError(f"expected {rows * cols} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples from the named distribution style.

    signed: U(-3,3); signed_small: U(-1.5,1.5); nonzero_signed: U(-3,3) with
    magnitudes below 0.25 pushed to +/-0.25 (safe divisors); positive:
    U(0.25,4); exp and cmp: U(-2,2).
    """
    if style == "signed":
        return rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    if style == "signed_small":
        return rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    if style == "nonzero_signed":
        vals = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        near_zero = np.abs(vals) < np.float32(0.25)
        vals[near_zero] = np.where(vals[near_zero] >= 0.0, np.float32(0.25), np.float32(-0.25))
        return vals
    if style == "positive":
        return rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    if style in ("exp", "cmp"):
        return rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    raise ValueError(f"unsupported float style: {style}")
size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, 
def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: expand packed bytes to a (rows, cols) bool array.

    `buf` holds `rows` rows of storage_cols bytes each (derived from
    buf.size); only the leading ceil(cols/64) little-endian 64-bit words of
    each row carry bits.

    Fix over the previous version: the per-bit Python loop is replaced with a
    vectorized np.unpackbits call; the LSB-first byte unpack reproduces the
    little-endian word bit order exactly.

    Raises:
        ValueError: non-positive rows/cols, buf not divisible into rows, or
            a row too short to hold `cols` packed bits.
    """
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    expanded = np.unpackbits(packed[:, :row_bytes], axis=1, bitorder="little")
    return expanded[:, :cols].astype(np.bool_)


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zero flat buffer with the metadata's size and dtype for `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every tensor the kernel reads, keyed by name."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write each read buffer to `<name>.bin` after dtype and size validation."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write each declared output's golden to `golden_<name>.bin` after validation."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Name of the case's only output; raises ValueError unless exactly one exists."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate the input and float32 golden for src-op-scalar (and unary) ops.

    `scalar_left` only affects "divs": True computes scalar / src, else src / scalar.

    Fix over the previous version: the distribution style is resolved once and
    the source is drawn once.  The old code drew the source array up to three
    times (for divs/rems and the log family), keeping only the last draw and
    silently burning RNG state.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"        # keep log/sqrt/reciprocal domains valid
    elif op == "exp":
        style = "exp"             # modest magnitudes keep exp() well-scaled
    elif op == "cmps":
        style = "cmp"
    elif op == "divs2":
        style = "nonzero_signed"  # src is the divisor: keep it away from zero
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    if op == "adds":
        out = src + s
    elif op == "subs":
        out = src - s
    elif op == "muls":
        out = src * s
    elif op == "divs":
        out = s / src if scalar_left else src / s
    elif op == "maxs":
        out = np.maximum(src, s)
    elif op == "mins":
        out = np.minimum(src, s)
    elif op == "rems":
        out = np.fmod(src, s)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate the input and float32 golden for a one-operand elementwise op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the op's domain valid
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    unary_ops = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda x: 1.0 / np.sqrt(x),
        "recip": lambda x: 1.0 / x,
        "relu": lambda x: np.maximum(x, np.float32(0.0)),
    }
    if op not in unary_ops:
        raise ValueError(f"unsupported unary float op: {op}")
    out = unary_ops[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs/golden for PReLU: src where positive, src * slope elsewhere."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_scalar_carry_case(op: str, scalar: float):
    """Generate input/golden for scalar add/sub that folds the source back in.

    addsc: (src + scalar) + src;  subsc: (src - scalar) + src — presumably
    mirrors the device op's accumulate form; confirm against the kernel.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op not in ("addsc", "subsc"):
        raise ValueError(f"unsupported scalar carry op: {op}")
    s = np.float32(scalar)
    out = (src + s if op == "addsc" else src - s) + src
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate input/golden for row-wise reductions: one value per row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        golden = matrix.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        golden = matrix.max(axis=1)
    elif op == "rowmin":
        golden = matrix.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs/golden for column-wise reductions (colsum/colmax/colmin).

    colsum cases also declare a scratch tensor, written as zeros.  With
    accumulate=True the output buffer is pre-seeded with small values that
    the colsum golden adds on top of.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        seed = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        seed = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = seed
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + seed
    elif op == "colmax":
        golden = matrix.max(axis=0)
    elif op == "colmin":
        golden = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Generate input/golden broadcasting column 0 of the source across all columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    first_col = _as_matrix(src)[:, :1]
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(first_col, COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_rowexpand_bin_case(op: str):
    """Generate inputs/golden for matrix-op-(per-row scalar) operations.

    The per-row scalars come from the first ROWS elements of src1's flat
    data, i.e. its first row — presumably mirroring how the kernel loads
    them; confirm against the device implementation.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # rowexpanddiv divides by the scalars, so keep them away from zero.
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    lhs = _as_matrix(src0)
    scalars = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    column = scalars[:, None]
    if op == "rowexpandmul":
        golden = lhs * column
    elif op == "rowexpanddiv":
        golden = lhs / column
    elif op == "rowexpandsub":
        golden = lhs - column
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Generate zero inputs and a golden filled with `scalar` (broadcast-fill op)."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})
def generate_sel_case():
    """Generate inputs and golden output for the `sel` (mask select) op.

    Writes a packed predicate mask plus two float operand buffers, and emits
    golden = where(mask, src0, src1) flattened to float32.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    # Mirror generate_cmp_case: fail loudly when the mask buffer does not
    # split evenly into per-row storage, instead of silently truncating the
    # stride and producing a confusing size error later.
    if meta.elem_counts[mask_name] % ROWS != 0:
        raise ValueError(
            f"{mask_name}: cannot derive mask storage stride from count={meta.elem_counts[mask_name]}"
        )
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate the input buffer and golden result for a self-applied bitwise op.

    The single input buffer is filled with integers of *dtype*; the golden
    output is op(src, src) (or ~src for "not").
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name, = meta.inputs
    # Shift amounts must stay tiny so shifting by the value itself is defined.
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(_rng(), meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    dispatch = {
        "and": lambda v: np.bitwise_and(v, v),
        "or": lambda v: np.bitwise_or(v, v),
        "xor": lambda v: np.bitwise_xor(v, v),
        "shl": lambda v: np.left_shift(v, v),
        "shr": lambda v: np.right_shift(v, v),
        "not": np.bitwise_not,
    }
    # Unknown ops are rejected only after the buffers are written, exactly as
    # the original elif chain did.
    if op not in dispatch:
        raise ValueError(f"unsupported bitwise op: {op}")
    result = dispatch[op](src)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary files elementwise.

    Returns True when both files exist, hold the same element count for
    *dtype*, and agree within *eps* (used as both atol and rtol; NaNs compare
    equal).  On any failure prints an [ERROR] diagnostic and returns False.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # np.unsignedinteger is a subtype of np.integer, so a single check
        # covers both signed and unsigned; everything else (floats, bool)
        # diffs in float64.
        # NOTE(review): very large uint64 values wrap when viewed as int64 in
        # this diagnostic (same as before) — confirm acceptable.
        cmp_dtype = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(cmp_dtype, copy=False)
        output_cmp = output.astype(cmp_dtype, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
def finalize_compare(ok: bool):
    """Report a compare result and translate it into process behaviour.

    Success prints an info line and returns True.  Failure either exits the
    process with status 2 (default, gating mode) or — when the environment
    variable COMPARE_STRICT is "0" — prints a warning and returns False so
    the caller can continue.
    """
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") == "0":
        print("[WARN] compare failed (non-gating)")
        return False
    print("[ERROR] compare failed")
    sys.exit(2)
00000000..4611be12 --- /dev/null +++ b/test/samples/Rowexpandmul/rowexpandmul_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, 
np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: 
def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Expand a packed per-row predicate mask into a (rows, cols) bool matrix.

    The packed layout stores each row as little-endian 64-bit words (8 bytes
    per word, one bit per column), optionally followed by padding bytes up to
    the per-row storage stride.

    Raises ValueError when rows/cols are non-positive, the buffer does not
    split evenly into rows, or a row is too short to hold *cols* bits.
    """
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    # Little-endian words of little-endian bytes mean bit k of a row lives at
    # byte k // 8, bit k % 8 — exactly np.unpackbits(..., bitorder="little"),
    # so the former per-row/per-word/per-bit triple loop collapses into one
    # vectorized call; trailing pad bits are sliced off.
    bits = np.unpackbits(packed[:, :row_bytes], axis=1, bitorder="little")
    return bits[:, :cols].astype(np.bool_)
def generate_binary_float_case(op: str):
    """Generate two float input buffers and the golden result of *op* on them.

    The right operand is kept away from zero for "div"/"rem" so the golden
    never divides by a (near-)zero value.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    kernels = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    # Unknown ops are rejected only after the buffers are written, matching
    # the original control flow.
    if op not in kernels:
        raise ValueError(f"unsupported binary float op: {op}")
    golden = kernels[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", 
"recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = 
np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = 
_single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + 
src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output 
= np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not 
np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_rowexpand_bin_case("rowexpandmul") diff --git a/test/samples/Rowexpandsub/rowexpandsub_compare.py b/test/samples/Rowexpandsub/rowexpandsub_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Rowexpandsub/rowexpandsub_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + 
"uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == 
"nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = 
np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size 
!= expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = 
_float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: 
out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + 
_write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), 
dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, 
{_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp 
= golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, 
cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Rowexpandsub/rowexpandsub_golden.py b/test/samples/Rowexpandsub/rowexpandsub_golden.py new file mode 100755 index 00000000..0016d52b --- /dev/null +++ b/test/samples/Rowexpandsub/rowexpandsub_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = 
"outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == 
np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = 
np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + 
rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src 
* np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = 
load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: 
expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise 
ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = 
np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: 
{output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_rowexpand_bin_case("rowexpandsub") diff --git a/test/samples/Rowmax/rowmax_compare.py b/test/samples/Rowmax/rowmax_compare.py new file 
mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Rowmax/rowmax_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, 
np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: 
np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: 
Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: 
float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", 
"recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = 
np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = 
_single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + 
src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output 
= np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not 
np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Rowmax/rowmax_golden.py b/test/samples/Rowmax/rowmax_golden.py new file mode 100755 index 00000000..71a790f9 --- /dev/null +++ b/test/samples/Rowmax/rowmax_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": 
np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = 
rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, 
"little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise 
ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, 
meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + 
+ +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + 
+def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + 
+def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, 
{_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), 
dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, 
{_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp 
= golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, 
cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_row_reduce_case("rowmax") diff --git a/test/samples/Rowmin/rowmin_compare.py b/test/samples/Rowmin/rowmin_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Rowmin/rowmin_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = 
Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == 
def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw `count` integers of `dtype` from one of the named distributions."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style != "bitwise":
            raise ValueError(f"unsupported int16 style: {style}")
        drawn = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            drawn = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            drawn = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return drawn.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes per packed row: one little-endian u64 per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a (rows, cols) bool mask into per-row little-endian 64-bit words.

    Each run of 64 columns becomes one u64 (bit i = column base+i) stored as
    8 little-endian bytes; every row is padded out to `storage_cols` bytes.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = np.zeros((rows, storage_cols), dtype=np.uint8)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = 0
            for b in range(min(64, cols - base)):
                if bits[r, base + b]:
                    word |= 1 << b
            packed[r, w * 8:(w + 1) * 8] = np.frombuffer(
                word.to_bytes(8, "little"), dtype=np.uint8
            )
    return packed.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover the (rows, cols) bool mask."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[r, w * 8:(w + 1) * 8]), "little")
            for b in range(min(64, cols - base)):
                bits[r, base + b] = ((word >> b) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zero buffer with the declared count/dtype for `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every tensor that main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialise every read tensor to <name>.bin after validating its size."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        flat = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialise every output tensor to golden_<name>.bin after validation."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        flat = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} golden elements, got {flat.size}")
        flat.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the sole output name, or fail loudly."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]
def generate_binary_float_case(op: str):
    """Write inputs + golden for an elementwise two-tensor float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Keep divisors away from zero for div/rem.
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    ops = {
        "add": lambda: lhs + rhs,
        "sub": lambda: lhs - rhs,
        "mul": lambda: lhs * rhs,
        "div": lambda: lhs / rhs,
        "max": lambda: np.maximum(lhs, rhs),
        "min": lambda: np.minimum(lhs, rhs),
        "rem": lambda: np.fmod(lhs, rhs),
    }
    if op not in ops:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write input + golden for a scalar/unary float op.

    `scalar_left` flips divs to scalar/src instead of src/scalar.
    NOTE: some branches deliberately draw the source twice; the discarded
    first draw advances the shared RNG and is part of the frozen fixtures.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]

    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")

    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    ops = {
        "adds": lambda: src + s,
        "subs": lambda: src - s,
        "muls": lambda: src * s,
        "divs": lambda: s / src if scalar_left else src / s,
        "maxs": lambda: np.maximum(src, s),
        "mins": lambda: np.minimum(src, s),
        "rems": lambda: np.fmod(src, s),
        "lrelu": lambda: np.where(src > 0.0, src, src * s),
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})


def generate_unary_float_case(op: str):
    """Write input + golden for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the domain valid for log/sqrt/recip
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    ops = {
        "abs": lambda: np.abs(src),
        "neg": lambda: -src,
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
        "relu": lambda: np.maximum(src, np.float32(0.0)),
    }
    if op not in ops:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})


def generate_prelu_case():
    """Write inputs + golden for PReLU: out = src if src > 0 else src * slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_addc_case(op: str):
    """Write inputs + golden for a three-tensor add/sub-with-carry op."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Write input + golden for src (+/-) scalar followed by + src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "addsc":
        out = src + s + src
    elif op == "subsc":
        out = src - s + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Write input + golden for a per-row reduction (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = tile.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = tile.max(axis=1)
    elif op == "rowmin":
        out = tile.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Write inputs + golden for a per-column reduction (sum/max/min).

    colsum cases carry an extra scratch tensor; with accumulate=True the
    output buffer is pre-seeded and folded into the colsum golden.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    seed_out = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        seed_out = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = seed_out
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = tile.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + seed_out
    elif op == "colmax":
        out = tile.max(axis=0)
    elif op == "colmin":
        out = tile.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Golden broadcasts each row's first column across the full row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(tile[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_colexpand_case():
    """Golden broadcasts the first row down every row of the tile."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    tile = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(tile[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden combines src0 with per-row scalars taken from src1's head."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    name0, name1 = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[name0], style="signed")
    src1 = _float_values(
        rng,
        meta.elem_counts[name1],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    tile0 = _as_matrix(src0)
    tile1 = _as_matrix(src1)
    # One scalar per row: the first ROWS elements of src1, row-major.
    per_row = tile1.reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[name0] = src0
    buffers[name1] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = tile0 * per_row
    elif op == "rowexpanddiv":
        out = tile0 / per_row
    elif op == "rowexpandsub":
        out = tile0 - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Inputs are all zeros; golden is the output tensor filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Write inputs + a packed predicate-mask golden for cmp/cmps."""
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(
            f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}"
        )
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Write mask + two sources; golden picks src0 where the mask bit is set."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, name0, name1 = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[name0], style="signed")
    src1 = _float_values(rng, meta.elem_counts[name1], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[name0] = src0
    buffers[name1] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_sels_case(select_mode: int):
    """Write inputs + golden for SELS: golden is src0 when mode==1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    name0, name1 = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[name0], style="signed")
    src1 = _float_values(rng, meta.elem_counts[name1], style="signed")
    buffers = _default_buffers(meta)
    buffers[name0] = src0
    buffers[name1] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Write input + golden for a bitwise op applied to the tensor and itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shl", "shr"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    ops = {
        "and": lambda: np.bitwise_and(src, src),
        "or": lambda: np.bitwise_or(src, src),
        "xor": lambda: np.bitwise_xor(src, src),
        "shl": lambda: np.left_shift(src, src),
        "shr": lambda: np.right_shift(src, src),
        "not": lambda: np.bitwise_not(src),
    }
    if op not in ops:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(ops[op](), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Write input + golden for a bitwise op between the tensor and a scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shls", "shrs"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the scalar through the target dtype so wrap-around matches device.
    scalar = np.asarray(scalar, dtype=dtype).item()
    ops = {
        "ands": lambda: np.bitwise_and(src, scalar),
        "ors": lambda: np.bitwise_or(src, scalar),
        "xors": lambda: np.bitwise_xor(src, scalar),
        "shls": lambda: np.left_shift(src, scalar),
        "shrs": lambda: np.right_shift(src, scalar),
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(ops[op](), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two raw binaries elementwise; report the worst mismatch."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dt = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dt)
    output = np.fromfile(output_path, dtype=dt)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before differencing so the report can't overflow or lose precision.
        wide = np.int64 if np.issubdtype(dt, np.integer) else np.float64
        g = golden.astype(wide, copy=False)
        o = output.astype(wide, copy=False)
        gap = np.abs(g - o)
        worst = int(np.argmax(gap))
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(gap[worst])} at idx={worst} "
            f"(golden={g[worst]}, out={o[worst]}, dtype={dt})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dt}")
    return False
os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Rowmin/rowmin_golden.py b/test/samples/Rowmin/rowmin_golden.py new file mode 100755 index 00000000..d5d5ed08 --- 
/dev/null +++ b/test/samples/Rowmin/rowmin_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, 
outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> 
def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a (rows, cols) bool mask into per-row little-endian 64-bit words.

    Each run of 64 columns becomes one u64 (bit i = column base+i) stored as
    8 little-endian bytes; every row is padded out to `storage_cols` bytes.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = np.zeros((rows, storage_cols), dtype=np.uint8)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = 0
            for b in range(min(64, cols - base)):
                if bits[r, base + b]:
                    word |= 1 << b
            packed[r, w * 8:(w + 1) * 8] = np.frombuffer(
                word.to_bytes(8, "little"), dtype=np.uint8
            )
    return packed.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover the (rows, cols) bool mask."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[r, w * 8:(w + 1) * 8]), "little")
            for b in range(min(64, cols - base)):
                bits[r, base + b] = ((word >> b) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zero buffer with the declared count/dtype for `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every tensor that main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialise every read tensor to <name>.bin after validating its size."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        flat = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialise every output tensor to golden_<name>.bin after validation."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        flat = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} golden elements, got {flat.size}")
        flat.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the sole output name, or fail loudly."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Write inputs + golden for an elementwise two-tensor float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Keep divisors away from zero for div/rem.
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    ops = {
        "add": lambda: lhs + rhs,
        "sub": lambda: lhs - rhs,
        "mul": lambda: lhs * rhs,
        "div": lambda: lhs / rhs,
        "max": lambda: np.maximum(lhs, rhs),
        "min": lambda: np.minimum(lhs, rhs),
        "rem": lambda: np.fmod(lhs, rhs),
    }
    if op not in ops:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write input + golden for a scalar/unary float op.

    `scalar_left` flips divs to scalar/src instead of src/scalar.
    NOTE: some branches deliberately draw the source twice; the discarded
    first draw advances the shared RNG and is part of the frozen fixtures.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]

    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")

    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    ops = {
        "adds": lambda: src + s,
        "subs": lambda: src - s,
        "muls": lambda: src * s,
        "divs": lambda: s / src if scalar_left else src / s,
        "maxs": lambda: np.maximum(src, s),
        "mins": lambda: np.minimum(src, s),
        "rems": lambda: np.fmod(src, s),
        "lrelu": lambda: np.where(src > 0.0, src, src * s),
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})


def generate_unary_float_case(op: str):
    """Write input + golden for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the domain valid for log/sqrt/recip
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    ops = {
        "abs": lambda: np.abs(src),
        "neg": lambda: -src,
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
        "relu": lambda: np.maximum(src, np.float32(0.0)),
    }
    if op not in ops:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})


def generate_prelu_case():
    """Write inputs + golden for PReLU: out = src if src > 0 else src * slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Write inputs + golden for a three-tensor add/sub-with-carry op."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], 
dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if 
meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + 
style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, 
dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, 
output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_row_reduce_case("rowmin") diff --git a/test/samples/Rowsum/rowsum_compare.py b/test/samples/Rowsum/rowsum_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null +++ b/test/samples/Rowsum/rowsum_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, 
+ "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, 
size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), 
dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: 
expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], 
style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def 
generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = 
pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, 
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary buffers elementwise within tolerance eps.

    Both files are read as `dtype`; returns True when shapes match and every
    element satisfies allclose(atol=eps, rtol=eps, NaN==NaN). On any failure
    an [ERROR] line describing the first problem (or worst element) is printed
    and False is returned.
    """
    # Existence checks: output first, then golden (matches report ordering).
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False

    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True

    # Mismatch: report the element with the largest absolute difference,
    # widening to a lossless comparison type first (int64 for any integer
    # dtype -- unsigned included, since unsignedinteger is a subtype of
    # np.integer -- float64 otherwise).
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    g = golden.astype(wide, copy=False)
    o = output.astype(wide, copy=False)
    abs_diff = np.abs(g - o)
    idx = int(np.argmax(abs_diff))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(abs_diff[idx])} at idx={idx} "
        f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})"
    )
    return False
def finalize_compare(ok: bool):
    """Report the aggregate compare verdict.

    On success prints [INFO] and returns True. On failure the behavior is
    governed by the COMPARE_STRICT env var (default "1"): strict mode prints
    [ERROR] and exits with status 2; non-strict ("0") prints [WARN] and
    returns False so the caller can continue.
    """
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False
def _rng():
    """Fresh deterministic generator; every script re-seeds with SEED."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as (rows, cols); reject any other element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples; `style` picks the uniform range.

    "nonzero_signed" additionally clamps near-zero draws away from zero so
    the values are safe to use as divisors.
    """
    ranges = {
        "signed": (-3.0, 3.0),
        "signed_small": (-1.5, 1.5),
        "nonzero_signed": (-3.0, 3.0),
        "positive": (0.25, 4.0),
        "exp": (-2.0, 2.0),
        "cmp": (-2.0, 2.0),
    }
    if style not in ranges:
        raise ValueError(f"unsupported float style: {style}")
    lo, hi = ranges[style]
    arr = rng.uniform(lo, hi, size=count).astype(np.float32)
    if style == "nonzero_signed":
        # Push |x| < 0.25 out to +/-0.25, keeping the sign (>= 0 maps to +0.25).
        near_zero = np.abs(arr) < np.float32(0.25)
        arr[near_zero] = np.where(arr[near_zero] >= 0.0, np.float32(0.25), np.float32(-0.25))
    return arr
"bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) 
+ for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, 
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input and golden for a scalar/unary float op.

    Writes the single random input buffer to <name>.bin and the expected
    result (input combined with `scalar`, or a unary function of the input)
    to golden_<name>.bin. `scalar_left` flips divs to scalar/src.

    Raises ValueError if the case does not have exactly one input or `op`
    is not recognized.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Pick a value style that keeps the op well-defined (e.g. positive for log).
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    # NOTE(review): "divs2" selects nonzero_signed here but has no branch in
    # the dispatch below, so it would raise "unsupported" -- confirm whether
    # divs2 cases are routed through divs with scalar_left instead.
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    # NOTE(review): for divs/rems and log/sqrt/rsqrt/recip, src is drawn a
    # second time with the same style. The extra draw advances the RNG stream,
    # so removing it would change every generated buffer; left as-is.
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        # leaky ReLU: negative inputs are scaled by `scalar`
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Generate input and golden for a row-wise reduction (rowsum/rowmax/rowmin).

    Reads case metadata from main.cpp, writes the random ROWS*COLS input
    buffer to <name>.bin and the per-row reduction (one value per row) to
    golden_<name>.bin.

    Raises ValueError if the case does not have exactly one input or `op`
    is not a known row reduction.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)  # also validates the element count is ROWS*COLS
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        # NOTE(review): float32 accumulation -- presumably matches the device
        # kernel's precision; confirm against the Rowsum implementation.
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + 
src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == 
"xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not 
os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_row_reduce_case("rowsum") diff --git a/test/samples/Rsqrt/rsqrt_compare.py b/test/samples/Rsqrt/rsqrt_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null 
#!/usr/bin/python3
# coding=utf-8
"""Standalone input/golden generation and compare helpers for one sample case."""

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


# Shared deterministic seed and the fixed matrix geometry used by every case.
SEED = 19
ROWS = 32
COLS = 32

# Host-side C/ACL type name -> numpy dtype used for the .bin buffers.
# NOTE(review): bfloat16_t maps to uint16 (raw bits) since numpy has no bf16.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Buffer metadata scraped from a sample's main.cpp plus outputs.txt."""

    elem_counts: Dict[str, int]    # buffer name -> element count
    np_types: Dict[str, np.dtype]  # buffer name -> numpy dtype
    read_order: List[str]          # order in which main.cpp reads .bin files
    outputs: List[str]             # buffer names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Buffers the kernel reads: everything in read_order not an output."""
        output_set = set(self.outputs)
        return [name for name in self.read_order if name not in output_set]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Resolve a host type name to its numpy dtype; KeyError if unknown."""
    key = host_type.strip()
    try:
        return np.dtype(_HOST_TYPE_TO_NP[key])
    except KeyError:
        raise KeyError(f"unsupported host type: {key}") from None
outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> 
np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in 
def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialise every buffer in meta.read_order to `<name>.bin`.

    Raises KeyError for a missing buffer and ValueError on a size mismatch.
    """
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialise the expected result of every output to `golden_<name>.bin`."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's only output name; raise if there isn't exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs and golden for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Keep the divisor away from zero for div/rem so the golden stays finite.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def _scalar_case_style(op: str) -> str:
    """Resolve the input distribution style for a scalar/unary float op."""
    if op == "exp":
        return "exp"
    if op == "cmps":
        return "cmp"
    if op == "divs2":
        # Scalar-over-vector division: keep the vector away from zero.
        return "nonzero_signed"
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        # Domain-restricted ops need strictly positive inputs.
        return "positive"
    return "signed"


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate inputs and golden for a vector-with-scalar float op.

    Fix vs. the original: the source buffer is drawn exactly once. The old
    code re-drew `src` for divs/rems and log/sqrt/rsqrt/recip using the same
    style, silently consuming two RNG draws and keeping only the second one.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style=_scalar_case_style(op))
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        # scalar_left selects scalar/vector instead of vector/scalar.
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate inputs and golden for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Same style resolution as the scalar ops (positive domain for log etc.).
    src = _float_values(rng, meta.elem_counts[src_name], style=_scalar_case_style(op))
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs and golden for PReLU: out = src > 0 ? src : src*slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs and golden for a three-operand add/sub-with-carry op."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_scalar_carry_case(op: str, scalar: float):
    """Inputs/golden for src (+/-) scalar followed by adding src again."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[name], style="signed")
    buffers = _default_buffers(meta)
    buffers[name] = src
    _write_buffers(meta, buffers)
    combine = {
        "addsc": lambda v: v + np.float32(scalar) + v,
        "subsc": lambda v: v - np.float32(scalar) + v,
    }
    if op not in combine:
        raise ValueError(f"unsupported scalar carry op: {op}")
    result = combine[op](src)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Inputs/golden for a per-row reduction over the ROWS x COLS matrix."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    result = reducers[op](matrix)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Inputs/golden for a per-column reduction; colsum uses a scratch buffer."""
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        # Seed the output buffer so accumulate semantics are exercised.
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        result = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            result = result + out_init
    elif op == "colmax":
        result = matrix.max(axis=0)
    elif op == "colmin":
        result = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: result.astype(np.float32)})


def generate_rowexpand_case():
    """Golden broadcasts each row's first element across the whole row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[name] = src
    _write_buffers(meta, buffers)
    broadcast = np.repeat(matrix[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): broadcast.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden broadcasts the first row across all rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[name] = src
    _write_buffers(meta, buffers)
    broadcast = np.repeat(matrix[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): broadcast.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden applies a per-row scalar (taken from src1) to all of src0."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    # Non-zero per-row scalars are required for the division variant.
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    lhs = _as_matrix(src0)
    per_row = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        result = lhs * per_row
    elif op == "rowexpanddiv":
        result = lhs / per_row
    elif op == "rowexpandsub":
        result = lhs - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All inputs stay zero; the golden is a constant-filled output buffer."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})
def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Inputs/golden for a predicate compare; golden is a packed bit mask."""
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        lhs_name, rhs_name = meta.inputs
        lhs = _float_values(rng, meta.elem_counts[lhs_name], style="cmp")
        rhs = _float_values(rng, meta.elem_counts[rhs_name], style="cmp")
        pred = _as_matrix(lhs) < _as_matrix(rhs)
        fills = {lhs_name: lhs, rhs_name: rhs}
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        lhs_name = meta.inputs[0]
        lhs = _float_values(rng, meta.elem_counts[lhs_name], style="cmp")
        pred = _as_matrix(lhs) > np.float32(scalar)
        fills = {lhs_name: lhs}
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers.update(fills)
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Inputs/golden for mask-select: out = mask ? src0 : src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    chosen = np.where(mask_bits, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden is src0 when select_mode == 1, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    chosen = a if int(select_mode) == 1 else b
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Inputs/golden for a bitwise op applied with src as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[name] = src
    _write_buffers(meta, buffers)
    binary_table = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(src)
    elif op in binary_table:
        result = binary_table[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Inputs/golden for a bitwise op between src and an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    name = meta.inputs[0]
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[name] = src
    _write_buffers(meta, buffers)
    # Narrow the immediate through the target dtype first (wrap-around).
    imm = np.asarray(scalar, dtype=dtype).item()
    scalar_table = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = scalar_table[op](src, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two .bin files elementwise within eps; log the worst diff."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff itself cannot overflow/lose range.
    if np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
        wide = np.int64
    else:
        wide = np.float64
    golden_cmp = golden.astype(wide, copy=False)
    output_cmp = output.astype(wide, copy=False)
    abs_diff = np.abs(golden_cmp - output_cmp)
    idx = int(np.argmax(abs_diff))
    diff = float(abs_diff[idx])
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
        f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful packed-mask bytes of each row."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # Each row stores `cols` bytes; only the leading packed bytes carry bits.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False
output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Rsqrt/rsqrt_golden.py b/test/samples/Rsqrt/rsqrt_golden.py new file mode 100755 index 00000000..9959d8ef --- /dev/null +++ b/test/samples/Rsqrt/rsqrt_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + 
"unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, 
size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), 
dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: 
expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], 
def _scalar_case_style(op: str) -> str:
    """Resolve the input distribution style for a scalar/unary float op."""
    if op == "exp":
        return "exp"
    if op == "cmps":
        return "cmp"
    if op == "divs2":
        # Scalar-over-vector division: keep the vector away from zero.
        return "nonzero_signed"
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        # Domain-restricted ops need strictly positive inputs.
        return "positive"
    return "signed"


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate inputs and golden for a vector-with-scalar float op.

    Fix vs. the original: the source buffer is drawn exactly once. The old
    code re-drew `src` for divs/rems and log/sqrt/rsqrt/recip using the same
    style, silently consuming two RNG draws and keeping only the second one.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style=_scalar_case_style(op))
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        # scalar_left selects scalar/vector instead of vector/scalar.
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate inputs and golden for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Same style resolution as the scalar ops (positive domain for log etc.).
    src = _float_values(rng, meta.elem_counts[src_name], style=_scalar_case_style(op))
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs and golden for PReLU: out = src > 0 ? src : src*slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs and golden for a three-operand add/sub-with-carry op."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Inputs/golden for src (+/-) scalar followed by adding src again."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Inputs/golden for a per-row reduction over the ROWS x COLS matrix."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Inputs/golden for a per-column reduction; colsum uses a scratch buffer."""
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
        tmp_name = None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        # Seed the output buffer so accumulate semantics are exercised.
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})
generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = 
pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, 
dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) 
+ output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = 
load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_unary_float_case("rsqrt") diff --git a/test/samples/Sel/sel_compare.py b/test/samples/Sel/sel_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Sel/sel_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + 
elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, 
size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, 
base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers 
= _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == 
"log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 
inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name 
= meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, 
def generate_sel_case():
    """Generate inputs and golden output for the mask-select (sel) op."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Draw raw predicate bits first, then pack them into the mask layout
    # the kernel consumes.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers.update({mask_name: mask, src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    selected = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): selected.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Generate inputs and golden for sels: mode 1 keeps src0, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers.update({src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate inputs/golden for a bitwise op applied to a tensor and itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift counts must stay tiny so shl/shr stay in well-defined range.
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    table = {
        "and": lambda a: np.bitwise_and(a, a),
        "or": lambda a: np.bitwise_or(a, a),
        "xor": lambda a: np.bitwise_xor(a, a),
        "shl": lambda a: np.left_shift(a, a),
        "shr": lambda a: np.right_shift(a, a),
        "not": np.bitwise_not,
    }
    if op not in table:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(table[op](src), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate inputs/golden for a bitwise op between a tensor and a scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the immediate through the target dtype so wrap-around matches
    # what the device sees.
    imm = np.asarray(scalar, dtype=dtype).item()
    table = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(table[op](src, imm), dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary files elementwise within eps (atol and rtol)."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff itself cannot overflow/lose precision.
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    golden_cmp = golden.astype(wide, copy=False)
    output_cmp = output.astype(wide, copy=False)
    abs_diff = np.abs(golden_cmp - output_cmp)
    idx = int(np.argmax(abs_diff))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(abs_diff[idx])} at idx={idx} "
        f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful packed-mask bytes of each stored row."""
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if min(golden.size, output.size) < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # Only the leading packed bytes of each row carry data; trailing
    # storage bytes are padding and are ignored.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    diff = np.nonzero(golden_sel != output_sel)[0]
    idx = int(diff[0]) if diff.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output buffer against its golden file."""
    meta = load_case_meta()
    # Run every comparison (no short-circuit) so all diagnostics print.
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Report the verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-4)
#!/usr/bin/python3
# coding=utf-8
"""Standalone golden generator/comparator helpers for one NPU test case.

The helpers parse the case's main.cpp to discover buffer names, element
counts and host-side dtypes, then read/write the matching .bin files.
"""

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


# Fixed seed and tile geometry shared by every generated case.
SEED = 19
ROWS = 32
COLS = 32

# Host-side C++ type name -> numpy dtype used for .bin serialization.
# bfloat16 has no native numpy dtype, so it is carried as raw uint16.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Buffer metadata parsed from a case's main.cpp / outputs.txt."""

    elem_counts: Dict[str, int]    # buffer name -> element count
    np_types: Dict[str, np.dtype]  # buffer name -> serialized numpy dtype
    read_order: List[str]          # order of ReadFile calls in main.cpp
    outputs: List[str]             # names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Buffers read by the kernel that are not declared outputs."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host C++ type name to its numpy dtype; KeyError if unknown."""
    key = host_type.strip()
    if key not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {key}")
    return np.dtype(_HOST_TYPE_TO_NP[key])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Parse element counts, dtypes, read order and outputs for this case."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    counts: Dict[str, int] = {}
    for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text):
        counts[match.group(1)] = int(match.group(2))
    types: Dict[str, np.dtype] = {}
    # fileSize_<name> = elemCount_<name> * sizeof(<host type>) ties each
    # buffer to its serialized dtype.
    for match in re.finditer(
        r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
        text,
    ):
        types[match.group(1)] = _host_type_to_np(match.group(2))
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs: List[str] = []
    outputs_file = Path(outputs_txt)
    if outputs_file.is_file():
        outputs = [
            line.strip()
            for line in outputs_file.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
    return CaseMeta(elem_counts=counts, np_types=types, read_order=read_order, outputs=outputs)
def _rng():
    """Deterministic generator so inputs and goldens are reproducible."""
    return np.random.default_rng(SEED)


def _as_matrix(arr, rows=None, cols=None):
    """Reshape a flat buffer to (rows, cols), validating the element count.

    Defaults late-bind to the module-level ROWS/COLS at call time, so the
    helpers also work when imported in isolation with explicit arguments.
    """
    rows = ROWS if rows is None else rows
    cols = COLS if cols is None else cols
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples from the distribution named by `style`."""
    if style == "signed":
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    elif style == "signed_small":
        arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    elif style == "nonzero_signed":
        # Signed values bounded away from zero (safe divisors).
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        mask = np.abs(arr) < np.float32(0.25)
        arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25))
    elif style == "positive":
        arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    elif style == "exp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    elif style == "cmp":
        arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    else:
        raise ValueError(f"unsupported float style: {style}")
    return arr


def _int_values(rng, count: int, dtype, *, style: str) -> np.ndarray:
    """Draw `count` integers of `dtype` for bitwise ("bitwise") or small
    shift-count ("shift_small") test data; rejects unsupported pairs."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes needed per row: cols bits rounded up to whole 64-bit words."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into per-row little-endian 64-bit words.

    Each row occupies `storage_cols` bytes; only the first
    _packed_row_bytes(cols) bytes carry data (LSB-first within each byte,
    bytes in little-endian word order) and the remainder stays zero.

    Raises ValueError if `bits` is not 2D or `storage_cols` is too small.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    # Pad each row to a whole number of 64-bit words and pack at C speed;
    # bitorder="little" reproduces the LSB-first layout of the former
    # per-bit Python loop exactly.
    padded = np.zeros((rows, row_bytes * 8), dtype=np.bool_)
    padded[:, :cols] = bits
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    out[:, :row_bytes] = np.packbits(padded, axis=1, bitorder="little")
    return out.reshape(-1)


def unpack_predicate_mask(buf, *, rows=None, cols=None) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover the (rows, cols) bool mask.

    Defaults late-bind to the module-level ROWS/COLS at call time.
    """
    rows = ROWS if rows is None else rows
    cols = COLS if cols is None else cols
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)[:, :row_bytes]
    bits = np.unpackbits(packed, axis=1, bitorder="little")
    return bits[:, :cols].astype(np.bool_)


def _zero_buffer(meta: "CaseMeta", name: str) -> np.ndarray:
    """All-zero array matching a declared buffer's count and dtype."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: "CaseMeta") -> Dict[str, np.ndarray]:
    """Zero-filled placeholders for every buffer in device read order."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}
def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every buffer to <name>.bin in device read order."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        flat = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize every declared output to golden_<name>.bin."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        flat = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} golden elements, got {flat.size}")
        flat.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's sole output name; error if there isn't exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs and golden for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors must be bounded away from zero for div/rem.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers.update({lhs_name: lhs, rhs_name: rhs})
    _write_buffers(meta, buffers)
    table = {
        "add": lambda a, b: a + b,
        "sub": lambda a, b: a - b,
        "mul": lambda a, b: a * b,
        "div": lambda a, b: a / b,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in table:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): table[op](lhs, rhs).astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate inputs and golden for a tensor-scalar (or unary) float op.

    `scalar_left` only matters for divs: scalar/src instead of src/scalar.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    if op == "exp":
        base_style = "exp"
    elif op == "cmps":
        base_style = "cmp"
    elif op in {"divs", "rems"}:
        base_style = "signed"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        base_style = "positive"
    else:
        base_style = "signed"
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else base_style)
    # NOTE(review): these re-draws look redundant, but each call advances
    # the RNG stream; they are kept so generated vectors stay bit-identical
    # to the established goldens.
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "adds":
        out = src + s
    elif op == "subs":
        out = src - s
    elif op == "muls":
        out = src * s
    elif op == "divs":
        out = s / src if scalar_left else src / s
    elif op == "maxs":
        out = np.maximum(src, s)
    elif op == "mins":
        out = np.minimum(src, s)
    elif op == "rems":
        out = np.fmod(src, s)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_unary_float_case(op: str):
    """Generate inputs and golden for a unary float op (abs/neg/exp/...)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # domain requires strictly positive samples
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "abs":
        golden = np.abs(src)
    elif op == "neg":
        golden = -src
    elif op == "exp":
        golden = np.exp(src)
    elif op == "log":
        golden = np.log(src)
    elif op == "sqrt":
        golden = np.sqrt(src)
    elif op == "rsqrt":
        golden = 1.0 / np.sqrt(src)
    elif op == "recip":
        golden = 1.0 / src
    elif op == "relu":
        golden = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs and golden for PReLU with a per-element slope tensor."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers.update({src_name: src, slope_name: slope})
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs and golden for addc/subc: (a op b) plus carry tensor c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers.update({a_name: a, b_name: b, c_name: c})
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = a + b + c
    elif op == "subc":
        golden = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate inputs and golden for addsc/subsc: (src op scalar) plus src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "addsc":
        golden = src + s + src
    elif op == "subsc":
        golden = src - s + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate inputs and golden for a per-row reduction (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        golden = matrix.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        golden = matrix.max(axis=1)
    elif op == "rowmin":
        golden = matrix.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs and golden for a per-column reduction.

    colsum cases declare one extra scratch buffer; with `accumulate` the
    output buffer is pre-seeded and folded into the column sums.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = matrix.max(axis=0)
    elif op == "colmin":
        golden = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast column 0 of the source across all columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(matrix[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast row 0 of the source across all rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(matrix[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_rowexpand_bin_case(op: str):
    """Binary op between src0 and src1's first ROWS values broadcast per row."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    matrix = _as_matrix(src0)
    # Only the first ROWS elements of src1 act as per-row scalars.
    row_scalars = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers.update({src0_name: src0, src1_name: src1})
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        golden = matrix * row_scalars[:, None]
    elif op == "rowexpanddiv":
        golden = matrix / row_scalars[:, None]
    elif op == "rowexpandsub":
        golden = matrix - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden is a constant fill; all declared buffers stay zeroed."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed predicate-mask golden for cmp/cmps.

    cmp compares two tensors with `<`; cmps compares one tensor against a
    scalar with `>`. The boolean result is packed into the mask layout.
    """
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    count = meta.elem_counts[out_name]
    if count % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={count}")
    packed = pack_predicate_mask(pred, storage_cols=count // ROWS)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Generate inputs and golden output for the mask-select (sel) op."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Draw the raw predicate bits first, then pack into the mask layout.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    selected = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): selected.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Generate inputs and golden for sels: mode 1 keeps src0, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate inputs/golden for a bitwise op applied to a tensor and itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift counts must stay tiny so shl/shr stay in well-defined range.
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    table = {
        "and": lambda a: np.bitwise_and(a, a),
        "or": lambda a: np.bitwise_or(a, a),
        "xor": lambda a: np.bitwise_xor(a, a),
        "shl": lambda a: np.left_shift(a, a),
        "shr": lambda a: np.right_shift(a, a),
        "not": np.bitwise_not,
    }
    if op not in table:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(table[op](src), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate inputs/golden for a bitwise op between a tensor and a scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the immediate through the target dtype so wrap-around matches
    # device behavior.
    imm = np.asarray(scalar, dtype=dtype).item()
    table = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(table[op](src, imm), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Elementwise binary-file comparison with atol=rtol=eps; prints diagnostics."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    expected = np.fromfile(golden_path, dtype=dtype_np)
    actual = np.fromfile(output_path, dtype=dtype_np)
    if expected.shape != actual.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {expected.shape} vs {output_path} {actual.shape}")
        return False
    if np.allclose(expected, actual, atol=eps, rtol=eps, equal_nan=True):
        return True
    if expected.size == 0:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff cannot overflow/lose precision.
    cast = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    e64 = expected.astype(cast, copy=False)
    a64 = actual.astype(cast, copy=False)
    gap = np.abs(e64 - a64)
    worst = int(np.argmax(gap))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(gap[worst])} at idx={worst} "
        f"(golden={e64[worst]}, out={a64[worst]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful packed-mask bytes of each stored row."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    expected = np.fromfile(golden_path, dtype=np.uint8)
    actual = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if expected.size < need or actual.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={expected.size}, out={actual.size}"
        )
        return False
    # Trailing bytes of each row are storage padding and are not compared.
    row_bytes = min(_packed_row_bytes(cols), cols)
    expected_sel = expected[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    actual_sel = actual[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(expected_sel, actual_sel):
        return True
    mismatches = np.nonzero(expected_sel != actual_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(expected_sel[idx])}, out={int(actual_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Run compare_bin for each declared output; gate via finalize_compare."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        if not compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps):
            ok = False
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Run compare_packed_pred_mask for each declared output."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        if not compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols):
            ok = False
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Report the verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    generate_sel_case()
@dataclass
class CaseMeta:
    """Buffer metadata parsed from a case's main.cpp / outputs.txt."""

    elem_counts: Dict[str, int]    # buffer name -> element count
    np_types: Dict[str, np.dtype]  # buffer name -> serialized numpy dtype
    read_order: List[str]          # order of ReadFile calls in main.cpp
    outputs: List[str]             # names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        """Buffers read by the kernel that are not declared outputs."""
        return [name for name in self.read_order if name not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host C++ type name to its numpy dtype; KeyError if unknown."""
    key = host_type.strip()
    if key not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {key}")
    return np.dtype(_HOST_TYPE_TO_NP[key])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Parse element counts, dtypes, read order and outputs for this case."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    counts: Dict[str, int] = {}
    for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text):
        counts[match.group(1)] = int(match.group(2))
    types: Dict[str, np.dtype] = {}
    for match in re.finditer(
        r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
        text,
    ):
        types[match.group(1)] = _host_type_to_np(match.group(2))
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs: List[str] = []
    outputs_file = Path(outputs_txt)
    if outputs_file.is_file():
        outputs = [
            line.strip()
            for line in outputs_file.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
    return CaseMeta(elem_counts=counts, np_types=types, read_order=read_order, outputs=outputs)


def _rng():
    """Deterministic generator so inputs and goldens are reproducible."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Reshape a flat buffer to (rows, cols), validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples from the distribution named by `style`."""
    uniform_ranges = {
        "signed": (-3.0, 3.0),
        "signed_small": (-1.5, 1.5),
        "positive": (0.25, 4.0),
        "exp": (-2.0, 2.0),
        "cmp": (-2.0, 2.0),
    }
    if style == "nonzero_signed":
        # Signed values bounded away from zero (safe divisors).
        arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        near_zero = np.abs(arr) < np.float32(0.25)
        arr[near_zero] = np.where(arr[near_zero] >= 0.0, np.float32(0.25), np.float32(-0.25))
        return arr
    if style not in uniform_ranges:
        raise ValueError(f"unsupported float style: {style}")
    lo, hi = uniform_ranges[style]
    return rng.uniform(lo, hi, size=count).astype(np.float32)


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw `count` integers of `dtype` for "bitwise" or "shift_small" styles."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style != "bitwise":
            raise ValueError(f"unsupported int16 style: {style}")
        vals = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes needed per row: cols bits rounded up to whole 64-bit words."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into per-row little-endian 64-bit words.

    Each row occupies `storage_cols` bytes; bits are LSB-first within each
    word and any padding bytes beyond the packed region stay zero.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            width = min(64, cols - base_col)
            word = sum(
                1 << bit_idx
                for bit_idx in range(width)
                if bits[row, base_col + bit_idx]
            )
            chunk = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
            out[row, word_idx * 8:(word_idx + 1) * 8] = chunk
    return out.reshape(-1)
def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zero buffer with the declared count/dtype for ``name``."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every tensor main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Write each read tensor as ``<name>.bin``, validating size and dtype."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Write each declared output as ``golden_<name>.bin``."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's only output name; raise if there is not exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs and golden for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisor-like operands avoid near-zero values.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    binary_ops = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    # The op check deliberately happens after the inputs were written,
    # mirroring the original control flow.
    if op not in binary_ops:
        raise ValueError(f"unsupported binary float op: {op}")
    golden = binary_ops[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input and golden for a scalar/unary float op.

    ``scalar_left`` only affects ``divs`` (scalar / src vs src / scalar).
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    elif op == "exp":
        style = "exp"
    elif op == "cmps":
        style = "cmp"
    else:
        style = "signed"
    # NOTE(review): the draw sequence below mirrors the original exactly --
    # some ops draw twice, which advances the RNG stream and therefore the
    # exact bytes written; kept as-is for reproducibility. Also note "divs2"
    # gets a nonzero_signed draw here but has no branch in the dispatch below,
    # so it always raises after writing files -- confirm whether intentional.
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    s = np.float32(scalar)
    if op == "adds":
        golden = src + s
    elif op == "subs":
        golden = src - s
    elif op == "muls":
        golden = src * s
    elif op == "divs":
        golden = s / src if scalar_left else src / s
    elif op == "maxs":
        golden = np.maximum(src, s)
    elif op == "mins":
        golden = np.minimum(src, s)
    elif op == "rems":
        golden = np.fmod(src, s)
    elif op == "lrelu":
        # Leaky ReLU: negative inputs are scaled by the slope ``scalar``.
        golden = np.where(src > 0.0, src, src * s)
    elif op == "exp":
        golden = np.exp(src)
    elif op == "log":
        golden = np.log(src)
    elif op == "sqrt":
        golden = np.sqrt(src)
    elif op == "rsqrt":
        golden = 1.0 / np.sqrt(src)
    elif op == "recip":
        golden = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_unary_float_case(op: str):
    """Generate input and golden for an elementwise unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op == "exp":
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        # Keep inputs in the valid domain of log/sqrt and away from zero.
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    unary_ops = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda x: 1.0 / np.sqrt(x),
        "recip": lambda x: 1.0 / x,
        "relu": lambda x: np.maximum(x, np.float32(0.0)),
    }
    if op not in unary_ops:
        raise ValueError(f"unsupported unary float op: {op}")
    golden = unary_ops[op](src)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs and golden for PReLU (per-element negative slope)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    # Positive inputs pass through; negative ones are scaled elementwise.
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs and golden for three-operand add/sub-with-carry ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    # Draw order matters for RNG reproducibility: a, then b, then c.
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name], buffers[b_name], buffers[c_name] = a, b, c
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = a + b + c
    elif op == "subc":
        golden = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate input/golden for scalar add/sub ops that re-add the source."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s = np.float32(scalar)
    if op == "addsc":
        golden = src + s + src
    elif op == "subsc":
        golden = src - s + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate input and golden for a per-row reduction (sum/max/min)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        golden = np.sum(matrix, axis=1, dtype=np.float32)
    elif op == "rowmax":
        golden = np.max(matrix, axis=1)
    elif op == "rowmin":
        golden = np.min(matrix, axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs and golden for a per-column reduction.

    ``colsum`` cases declare one extra scratch tensor in main.cpp and may
    accumulate onto a pre-initialized output buffer when ``accumulate`` is set.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        # Scratch tensor is fed as zeros; the kernel is expected to overwrite it.
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        # Non-zero initial output exercises the accumulate path.
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = np.sum(matrix, axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = np.max(matrix, axis=0)
    elif op == "colmin":
        golden = np.min(matrix, axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Golden: broadcast column 0 of the source across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.tile(matrix[:, :1], (1, COLS))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden: broadcast row 0 of the source across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.tile(matrix[:1, :], (ROWS, 1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Golden: src0 combined with a per-row scalar drawn from src1.

    The per-row scalars are the first ROWS elements of src1's flat buffer.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    src1 = _float_values(rng, meta.elem_counts[src1_name], style=src1_style)
    matrix = _as_matrix(src0)
    # _as_matrix also validates src1's element count before we slice it.
    per_row = _as_matrix(src1).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        golden = matrix * per_row
    elif op == "rowexpanddiv":
        golden = matrix / per_row
    elif op == "rowexpandsub":
        golden = matrix - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_expands_case(scalar: float):
    """Golden: output filled with ``scalar``; all inputs are zero buffers."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed predicate-mask golden for compare ops.

    ``cmp``  computes elementwise src0 < src1; ``cmps`` computes src0 > scalar.
    """
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    # The mask's per-row byte stride is derived from the declared output size.
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Generate mask/src0/src1 inputs and golden for masked select."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Draw order (mask, src0, src1) preserves the RNG stream.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    # True bits pick src0, false bits pick src1.
    golden = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden for scalar select: mode 1 picks src0, anything else picks src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    chosen = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate input/golden for bitwise ops applied to the source itself.

    Each binary op uses the source tensor as both operands (e.g. ``src & src``,
    ``src << src``), matching the self-combination form of the kernel.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    self_ops = {
        "and": lambda x: np.bitwise_and(x, x),
        "or": lambda x: np.bitwise_or(x, x),
        "xor": lambda x: np.bitwise_xor(x, x),
        "shl": lambda x: np.left_shift(x, x),
        "shr": lambda x: np.right_shift(x, x),
        "not": np.bitwise_not,
    }
    if op not in self_ops:
        raise ValueError(f"unsupported bitwise op: {op}")
    golden = self_ops[op](src)
    _write_golden(meta, {_single_output(meta): np.asarray(golden, dtype=dtype)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate input and golden for bitwise/shift ops with a scalar operand."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the scalar through the target dtype so wraparound matches device.
    scalar = np.asarray(scalar, dtype=dtype).item()
    scalar_ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in scalar_ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    out = scalar_ops[op](src, scalar)
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a golden .bin against a device-output .bin of the same dtype.

    ``eps`` is used as both absolute and relative tolerance. Returns True on
    match; prints an ``[ERROR]`` diagnostic and returns False otherwise.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Promote to a wide common type so the reported diff cannot overflow.
        # np.integer already covers unsigned integer dtypes, so a single
        # subtype check suffices (the former extra unsignedinteger check
        # was redundant).
        if np.issubdtype(dtype_np, np.integer):
            golden_cmp = golden.astype(np.int64, copy=False)
            output_cmp = output.astype(np.int64, copy=False)
        else:
            golden_cmp = golden.astype(np.float64, copy=False)
            output_cmp = output.astype(np.float64, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate-mask buffers, ignoring per-row padding bytes.

    NOTE(review): ``cols`` is used here as the per-row storage width in bytes
    (both buffers are reshaped to rows x cols uint8) and only the first
    ``row_bytes`` bytes of each row are compared -- confirm against callers
    that this is the intended interpretation.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel != output_sel)[0]
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden; gate via finalize."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows=None, cols=None):
    """Compare every declared packed-mask output against its golden.

    ``rows``/``cols`` default to the module's ROWS/COLS; they are resolved at
    call time (rather than as eager defaults) so importing this function does
    not require the constants to exist yet -- behavior for callers is
    unchanged.
    """
    rows = ROWS if rows is None else rows
    cols = COLS if cols is None else cols
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)
+ for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Sels/sels_golden.py b/test/samples/Sels/sels_golden.py new file mode 100755 index 00000000..37417eb0 --- /dev/null +++ b/test/samples/Sels/sels_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + 
match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, 
dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in 
enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = 
_default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": 
+ out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, 
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate input and golden buffers for a column reduction.

    ``op`` is one of colsum, colmax, colmin. colsum cases carry an extra
    scratch buffer as a second non-output input. When ``accumulate`` is
    True, the output buffer is pre-seeded with small random values and the
    colsum golden adds them in.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None

    rng = _rng()
    # Draw order matters for reproducibility: src first, out_init second.
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)

    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        # scratch buffer starts zeroed
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)

    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op in ("colmax", "colmin"):
        out = src_m.max(axis=0) if op == "colmax" else src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})
def generate_rowexpand_bin_case(op: str):
    """Generate inputs and golden for a row-broadcast binary op.

    The second input supplies one scalar per row (its first ROWS elements);
    the golden applies ``op`` (rowexpandmul / rowexpanddiv / rowexpandsub)
    between each row of the first input and that row's scalar.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, scl_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # a division needs row scalars bounded away from zero
    scl_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    scl = _float_values(rng, meta.elem_counts[scl_name], style=scl_style)
    lhs_m = _as_matrix(lhs)
    scl_m = _as_matrix(scl)
    # one scalar per row, shaped (ROWS, 1) for broadcasting
    col = scl_m.reshape(-1)[:ROWS].astype(np.float32)[:, None]

    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[scl_name] = scl
    _write_buffers(meta, buffers)

    if op == "rowexpandmul":
        out = lhs_m * col
    elif op == "rowexpanddiv":
        out = lhs_m / col
    elif op == "rowexpandsub":
        out = lhs_m - col
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_sel_case():
    """Generate inputs and golden for sel: out = mask ? src0 : src1.

    Inputs are (mask, src0, src1) in read order. The mask buffer stores a
    packed per-row bit mask; its element count must be divisible by ROWS so
    a per-row storage stride can be derived.

    Raises ValueError on a wrong input count or a non-divisible mask size.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    # Consistency/robustness fix: validate divisibility instead of silently
    # truncating the stride (generate_cmp_case already enforces this rule).
    if meta.elem_counts[mask_name] % ROWS != 0:
        raise ValueError(
            f"{mask_name}: cannot derive mask storage stride from count={meta.elem_counts[mask_name]}"
        )
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate inputs and golden for a bitwise/shift op applied to src and itself.

    ``op`` is one of and/or/xor/shl/shr/not; ``dtype`` is the integer numpy
    dtype of the buffers. Shift ops sample small non-negative shift counts.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)

    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    # Dispatch table: every op combines src with itself, except "not".
    ops = {
        "and": lambda v: np.bitwise_and(v, v),
        "or": lambda v: np.bitwise_or(v, v),
        "xor": lambda v: np.bitwise_xor(v, v),
        "shl": lambda v: np.left_shift(v, v),
        "shr": lambda v: np.right_shift(v, v),
        "not": np.bitwise_not,
    }
    if op not in ops:
        raise ValueError(f"unsupported bitwise op: {op}")
    out = ops[op](src)
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a golden .bin file against an output .bin file elementwise.

    Returns True when both files exist, hold the same number of ``dtype``
    elements, and every element matches within atol=eps / rtol=eps (NaNs
    compare equal). Prints an [ERROR] diagnostic and returns False otherwise.
    """
    for label, path in (("Output", output_path), ("Golden", golden_path)):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Promote to a wide type so the reported difference cannot itself
    # overflow (ints) or lose precision (half floats).
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    golden_cmp = golden.astype(wide, copy=False)
    output_cmp = output.astype(wide, copy=False)
    abs_diff = np.abs(golden_cmp - output_cmp)
    idx = int(np.argmax(abs_diff))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(abs_diff[idx])} at idx={idx} "
        f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
    )
    return False
def finalize_compare(ok: bool):
    """Report the compare verdict and gate the process on failure.

    Prints a summary line. On failure, exits with status 2 unless the
    COMPARE_STRICT environment variable is "0" (non-gating mode), in which
    case it returns False. Returns True on success.
    """
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False
@dataclass
class CaseMeta:
    """Buffer metadata parsed from a generated main.cpp plus outputs.txt."""

    # element count per buffer name
    elem_counts: Dict[str, int]
    # numpy dtype per buffer name
    np_types: Dict[str, np.dtype]
    # buffer names in the order main.cpp reads their .bin files
    read_order: List[str]
    # buffer names the kernel writes (listed in outputs.txt)
    outputs: List[str]

    @property
    def inputs(self) -> List[str]:
        """Read-order buffer names that are not outputs."""
        produced = set(self.outputs)
        return [name for name in self.read_order if name not in produced]
int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D 
def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Expand a packed 64-bit-word predicate mask into a (rows, cols) bool array.

    The buffer holds one row per ``rows``; within a row, each group of 64
    columns is stored as one little-endian 64-bit word (8 bytes), lowest
    bit = lowest column. Raises ValueError when rows/cols are non-positive,
    the buffer is not divisible by ``rows``, or a row's storage is smaller
    than the packed row width.
    """
    flat = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if flat.size % rows != 0:
        raise ValueError(f"mask buffer size {flat.size} is not divisible by rows={rows}")
    storage_cols = flat.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = flat.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx in range((cols + 63) // 64):
            base_col = word_idx * 64
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            for bit_idx in range(min(64, cols - base_col)):
                bits[row, base_col + bit_idx] = bool((word >> bit_idx) & 1)
    return bits
def generate_binary_float_case(op: str):
    """Generate inputs and golden for an elementwise binary float op.

    ``op`` is one of add, sub, mul, div, max, min, rem. Division-like ops
    draw the right-hand side bounded away from zero.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    # Each op maps directly onto a numpy ufunc.
    ufuncs = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in ufuncs:
        raise ValueError(f"unsupported binary float op: {op}")
    out = ufuncs[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + 
buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 
1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
def generate_rowexpand_bin_case(op: str):
    """Write inputs and golden for row-broadcast binary ops.

    The first ROWS values of the second input act as one scalar per row,
    broadcast across the matrix view of the first input.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisor rows avoid a near-zero band so rowexpanddiv goldens stay stable.
    rhs = _float_values(rng, meta.elem_counts[rhs_name],
                        style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    lhs_m = _as_matrix(lhs)
    col = _as_matrix(rhs).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    table = {
        "rowexpandmul": lambda: lhs_m * col,
        "rowexpanddiv": lambda: lhs_m / col,
        "rowexpandsub": lambda: lhs_m - col,
    }
    if op not in table:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): table[op]().astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All-default (zero) inputs; golden is the output filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Write inputs and a bit-packed predicate-mask golden for cmp/cmps.

    'cmp' compares two tensors element-wise (lhs < rhs); 'cmps' compares one
    tensor against `scalar` (src > scalar).  The golden uses the output
    buffer's per-row byte stride for pack_predicate_mask.
    """
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1_name is not None and src1 is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    count = meta.elem_counts[out_name]
    if count % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={count}")
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=count // ROWS)})


def generate_sel_case():
    """Random mask plus two sources; golden takes src0 where the mask bit is set."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    stride = meta.elem_counts[mask_name] // ROWS
    bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed = pack_predicate_mask(bits, storage_cols=stride)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    chosen = np.where(bits, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Two random sources; golden is src0 when select_mode == 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    picked = a if int(select_mode) == 1 else b
    _write_golden(meta, {_single_output(meta): picked.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Integer op applied to a random buffer with itself ('not' is unary)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(rng, meta.elem_counts[src_name], dtype,
                      style="shift_small" if op in {"shl", "shr"} else "bitwise")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    table = {
        "and": lambda: np.bitwise_and(src, src),
        "or": lambda: np.bitwise_or(src, src),
        "xor": lambda: np.bitwise_xor(src, src),
        "shl": lambda: np.left_shift(src, src),
        "shr": lambda: np.right_shift(src, src),
        "not": lambda: np.bitwise_not(src),
    }
    if op not in table:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(table[op](), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Integer op between a random buffer and `scalar` (cast to dtype first)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(rng, meta.elem_counts[src_name], dtype,
                      style="shift_small" if op in {"shls", "shrs"} else "bitwise")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    rhs = np.asarray(scalar, dtype=dtype).item()
    table = {
        "ands": lambda: np.bitwise_and(src, rhs),
        "ors": lambda: np.bitwise_or(src, rhs),
        "xors": lambda: np.bitwise_xor(src, rhs),
        "shls": lambda: np.left_shift(src, rhs),
        "shrs": lambda: np.right_shift(src, rhs),
    }
    if op not in table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(table[op](), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary dumps as arrays of `dtype` within tolerance eps.

    Returns True when both files exist, hold the same element count, and agree
    under np.allclose (atol=rtol=eps, NaNs treated as equal); otherwise prints
    a diagnostic and returns False.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff itself cannot overflow / lose precision
    # (np.unsignedinteger is a subtype of np.integer, so one check covers both).
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    g = golden.astype(wide, copy=False)
    o = output.astype(wide, copy=False)
    gap = np.abs(g - o)
    worst = int(np.argmax(gap))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(gap[worst])} at idx={worst} "
        f"(golden={g[worst]}, out={o[worst]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful bytes of two packed predicate-mask dumps.

    Each row stores `cols` bytes but only the first _packed_row_bytes(cols)
    of them carry mask bits; trailing padding bytes are ignored.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    bad = np.nonzero(golden_sel != output_sel)[0]
    first = int(bad[0]) if bad.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={first} "
        f"(golden={int(golden_sel[first])}, out={int(output_sel[first])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden; gate via finalize_compare."""
    meta = load_case_meta()
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps)
               for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = 32, cols: int = 32):
    """Packed-mask variant of compare_all_outputs (32 == module ROWS/COLS)."""
    meta = load_case_meta()
    results = [compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
               for name in meta.outputs]
    return finalize_compare(all(results))


def finalize_compare(ok: bool):
    """Report the aggregate result; exit(2) on failure unless COMPARE_STRICT=0."""
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False


if __name__ == "__main__":
    compare_all_outputs(np.int32, 0.0)


# --- test/samples/Shl/shl_golden.py (new file in this patch) ---
#!/usr/bin/python3
# coding=utf-8

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np

# Deterministic generation parameters shared by every per-case script.
SEED = 19
ROWS = 32
COLS = 32

# Host (C/ACL) type name -> numpy dtype used when sizing buffers.
# bfloat16_t has no native numpy dtype, so raw uint16 storage is used.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}
Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Write one float input and the golden for a scalar/unary float op.

    `scalar_left` only affects 'divs' (scalar/src instead of src/scalar).
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    elif op == "cmps":
        style = "cmp"
    elif op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, count, style="nonzero_signed" if op == "divs2" else style)
    # NOTE(review): for divs/rems and the log group the draw below repeats the
    # one above and discards its result.  It looks redundant, but it is kept
    # verbatim so the RNG stream (and hence the generated .bin files) stays
    # byte-identical to the original script — confirm before simplifying.
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    s32 = np.float32(scalar)
    table = {
        "adds": lambda: src + s32,
        "subs": lambda: src - s32,
        "muls": lambda: src * s32,
        "divs": lambda: s32 / src if scalar_left else src / s32,
        "maxs": lambda: np.maximum(src, s32),
        "mins": lambda: np.minimum(src, s32),
        "rems": lambda: np.fmod(src, s32),
        "lrelu": lambda: np.where(src > 0.0, src, src * s32),
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
    }
    if op not in table:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): table[op]().astype(np.float32)})


def generate_unary_float_case(op: str):
    """Write one float input and the golden for a pure unary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    table = {
        "abs": lambda: np.abs(src),
        "neg": lambda: -src,
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
        "relu": lambda: np.maximum(src, np.float32(0.0)),
    }
    if op not in table:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): table[op]().astype(np.float32)})


def generate_prelu_case():
    """Element-wise PReLU: src where positive, src*slope elsewhere."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Three-input carry ops: addc -> a+b+c, subc -> a-b+c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Scalar carry ops: addsc -> src+scalar+src, subsc -> src-scalar+src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Row-wise reduction (sum/max/min) over the ROWSxCOLS matrix view."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    table = {
        "rowsum": lambda: src_m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda: src_m.max(axis=1),
        "rowmin": lambda: src_m.min(axis=1),
    }
    if op not in table:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): table[op]().astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Column-wise reduction; colsum may accumulate into a pre-seeded output.

    colsum cases declare an extra scratch input buffer (zero-filled here);
    with accumulate=True the output buffer is pre-seeded and added in.
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Golden broadcasts each row's first element across the whole row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden broadcasts the first row down every row of the matrix."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Write inputs and golden for row-broadcast binary ops.

    The first ROWS values of the second input act as one scalar per row,
    broadcast across the matrix view of the first input.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs = _float_values(rng, meta.elem_counts[rhs_name],
                        style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    lhs_m = _as_matrix(lhs)
    col = _as_matrix(rhs).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    table = {
        "rowexpandmul": lambda: lhs_m * col,
        "rowexpanddiv": lambda: lhs_m / col,
        "rowexpandsub": lambda: lhs_m - col,
    }
    if op not in table:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): table[op]().astype(np.float32).reshape(-1)})
def generate_expands_case(scalar: float):
    """All-default (zero) inputs; golden is the output filled with `scalar`."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Write inputs and a bit-packed predicate-mask golden for cmp/cmps.

    'cmp' compares two tensors element-wise (lhs < rhs); 'cmps' compares one
    tensor against `scalar` (src > scalar).  The golden uses the output
    buffer's per-row byte stride for pack_predicate_mask.
    """
    meta = load_case_meta()
    rng = _rng()
    src1_name = None
    src1 = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1_name is not None and src1 is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    count = meta.elem_counts[out_name]
    if count % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={count}")
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=count // ROWS)})


def generate_sel_case():
    """Random mask plus two sources; golden takes src0 where the mask bit is set."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    stride = meta.elem_counts[mask_name] // ROWS
    bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed = pack_predicate_mask(bits, storage_cols=stride)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    chosen = np.where(bits, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Two random sources; golden is src0 when select_mode == 1, else src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    picked = a if int(select_mode) == 1 else b
    _write_golden(meta, {_single_output(meta): picked.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Integer op applied to a random buffer with itself ('not' is unary)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(rng, meta.elem_counts[src_name], dtype,
                      style="shift_small" if op in {"shl", "shr"} else "bitwise")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    table = {
        "and": lambda: np.bitwise_and(src, src),
        "or": lambda: np.bitwise_or(src, src),
        "xor": lambda: np.bitwise_xor(src, src),
        "shl": lambda: np.left_shift(src, src),
        "shr": lambda: np.right_shift(src, src),
        "not": lambda: np.bitwise_not(src),
    }
    if op not in table:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(table[op](), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Integer op between a random buffer and `scalar` (cast to dtype first)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(rng, meta.elem_counts[src_name], dtype,
                      style="shift_small" if op in {"shls", "shrs"} else "bitwise")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    rhs = np.asarray(scalar, dtype=dtype).item()
    table = {
        "ands": lambda: np.bitwise_and(src, rhs),
        "ors": lambda: np.bitwise_or(src, rhs),
        "xors": lambda: np.bitwise_xor(src, rhs),
        "shls": lambda: np.left_shift(src, rhs),
        "shrs": lambda: np.right_shift(src, rhs),
    }
    if op not in table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(table[op](), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two flat binary dumps as arrays of `dtype` within tolerance eps.

    Returns True when both files exist, hold the same element count, and agree
    under np.allclose (atol=rtol=eps, NaNs treated as equal); otherwise prints
    a diagnostic and returns False.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff itself cannot overflow / lose precision
    # (np.unsignedinteger is a subtype of np.integer, so one check covers both).
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    g = golden.astype(wide, copy=False)
    o = output.astype(wide, copy=False)
    gap = np.abs(g - o)
    worst = int(np.argmax(gap))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(gap[worst])} at idx={worst} "
        f"(golden={g[worst]}, out={o[worst]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare only the meaningful bytes of two packed predicate-mask dumps.

    Each row stores `cols` bytes but only the first _packed_row_bytes(cols)
    of them carry mask bits; trailing padding bytes are ignored.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    bad = np.nonzero(golden_sel != output_sel)[0]
    first = int(bad[0]) if bad.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={first} "
        f"(golden={int(golden_sel[first])}, out={int(output_sel[first])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden; gate via finalize_compare."""
    meta = load_case_meta()
    results = [compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps)
               for name in meta.outputs]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = 32, cols: int = 32):
    """Packed-mask variant of compare_all_outputs (32 == module ROWS/COLS)."""
    meta = load_case_meta()
    results = [compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
               for name in meta.outputs]
    return finalize_compare(all(results))
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_self_case("shl", np.int32) diff --git a/test/samples/Shls/shls_compare.py b/test/samples/Shls/shls_compare.py new file mode 100755 index 00000000..165980ed --- /dev/null +++ b/test/samples/Shls/shls_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
def _rng():
    """Fresh deterministic generator so every sample script draws identical data."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Reshape a flat buffer to (rows, cols), validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    want = rows * cols
    if flat.size != want:
        raise ValueError(f"expected {want} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw float32 samples whose value range matches the requested *style*."""
    ranges = {
        "signed": (-3.0, 3.0),
        "signed_small": (-1.5, 1.5),
        "nonzero_signed": (-3.0, 3.0),
        "positive": (0.25, 4.0),
        "exp": (-2.0, 2.0),
        "cmp": (-2.0, 2.0),
    }
    if style not in ranges:
        raise ValueError(f"unsupported float style: {style}")
    low, high = ranges[style]
    vals = rng.uniform(low, high, size=count).astype(np.float32)
    if style == "nonzero_signed":
        # Push near-zero draws away from zero so divisors stay well conditioned.
        tiny = np.abs(vals) < np.float32(0.25)
        vals[tiny] = np.where(vals[tiny] >= 0.0, np.float32(0.25), np.float32(-0.25))
    return vals


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw integer samples: 'bitwise' spans [-256, 256), 'shift_small' [0, 4)."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16) and style == "bitwise":
        raw = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32) and style == "bitwise":
        raw = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32) and style == "shift_small":
        raw = rng.integers(0, 4, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int16):
        raise ValueError(f"unsupported int16 style: {style}")
    elif dtype == np.dtype(np.int32):
        raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return raw.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes needed per row when *cols* predicate bits are packed into 64-bit words."""
    words = (cols + 63) // 64
    return words * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean mask into little-endian 64-bit words, one run per row.

    Bit i of word w in a row corresponds to column w * 64 + i; each packed row
    is padded with zero bytes up to *storage_cols*.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = np.zeros((rows, storage_cols), dtype=np.uint8)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = 0
            for bit in range(min(64, cols - base)):
                word |= int(bits[r, base + bit]) << bit
            packed[r, w * 8:(w + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return packed.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: expand packed bytes back to a bool matrix."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[r, w * 8:(w + 1) * 8]), "little")
            for bit in range(min(64, cols - base)):
                bits[r, base + bit] = ((word >> bit) & 1) != 0
    return bits
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
def generate_binary_float_case(op: str):
    """Emit input .bin files and the golden result for a two-operand float kernel."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    # Divisors must stay away from zero for div/rem.
    b = _float_values(rng, meta.elem_counts[b_name],
                      style="nonzero_signed" if op in {"div", "rem"} else "signed")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)

    # Lazy dispatch: only the selected op is evaluated.
    ops = {
        "add": lambda: a + b,
        "sub": lambda: a - b,
        "mul": lambda: a * b,
        "div": lambda: a / b,
        "max": lambda: np.maximum(a, b),
        "min": lambda: np.minimum(a, b),
        "rem": lambda: np.fmod(a, b),
    }
    if op not in ops:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Emit input and golden for a scalar/unary float kernel.

    *scalar_left* flips divs to scalar / src instead of src / scalar.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    count = meta.elem_counts[src_name]
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        first_style = "positive"
    elif op == "exp":
        first_style = "exp"
    elif op == "cmps":
        first_style = "cmp"
    else:
        first_style = "signed"
    if op == "divs2":
        first_style = "nonzero_signed"
    # NOTE(review): for divs/rems and log-family ops the extra draws below
    # discard the first sample but advance the RNG; kept as-is so the
    # generated vectors stay bit-identical to the original script.
    src = _float_values(rng, count, style=first_style)
    if op in {"divs", "rems"}:
        src = _float_values(rng, count, style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, count, style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    k = np.float32(scalar)
    ops = {
        "adds": lambda: src + k,
        "subs": lambda: src - k,
        "muls": lambda: src * k,
        "divs": lambda: (k / src) if scalar_left else (src / k),
        "maxs": lambda: np.maximum(src, k),
        "mins": lambda: np.minimum(src, k),
        "rems": lambda: np.fmod(src, k),
        "lrelu": lambda: np.where(src > 0.0, src, src * k),
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})


def generate_unary_float_case(op: str):
    """Emit input and golden for a pure unary float kernel."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # log-family ops need strictly positive inputs; exp uses a tighter range.
    styles = {"exp": "exp", "log": "positive", "sqrt": "positive",
              "rsqrt": "positive", "recip": "positive"}
    src = _float_values(rng, meta.elem_counts[src_name], style=styles.get(op, "signed"))
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    ops = {
        "abs": lambda: np.abs(src),
        "neg": lambda: -src,
        "exp": lambda: np.exp(src),
        "log": lambda: np.log(src),
        "sqrt": lambda: np.sqrt(src),
        "rsqrt": lambda: 1.0 / np.sqrt(src),
        "recip": lambda: 1.0 / src,
        "relu": lambda: np.maximum(src, np.float32(0.0)),
    }
    if op not in ops:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32)})


def generate_prelu_case():
    """Emit inputs and golden for PReLU: src where positive, src * slope otherwise."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Emit inputs and golden for the three-operand a +/- b + c kernels."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    names = meta.inputs
    a = _float_values(rng, meta.elem_counts[names[0]], style="signed")
    b = _float_values(rng, meta.elem_counts[names[1]], style="signed")
    c = _float_values(rng, meta.elem_counts[names[2]], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[names[0]], buffers[names[1]], buffers[names[2]] = a, b, c
    _write_buffers(meta, buffers)
    if op == "addc":
        golden = a + b + c
    elif op == "subc":
        golden = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Emit input and golden for src +/- scalar + src kernels."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        golden = src + np.float32(scalar) + src
    elif op == "subsc":
        golden = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Emit input and golden for per-row sum/max/min reductions."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda: mat.sum(axis=1, dtype=np.float32),
        "rowmax": lambda: mat.max(axis=1),
        "rowmin": lambda: mat.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): reducers[op]().astype(np.float32)})
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Emit input and golden for per-column reductions.

    colsum additionally consumes a scratch tensor, and with *accumulate* set
    it seeds the output row with random values that are added to the sums.
    """
    meta = load_case_meta()
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        seed_out = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        seed_out = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = seed_out
    _write_buffers(meta, buffers)
    if op == "colsum":
        golden = mat.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + seed_out
    elif op == "colmax":
        golden = mat.max(axis=0)
    elif op == "colmin":
        golden = mat.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast column 0 of the input matrix across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(mat[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast row 0 of the input matrix across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    mat = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    golden = np.repeat(mat[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Combine a matrix with a per-row scalar taken from the second input."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name],
                      style="nonzero_signed" if op == "rowexpanddiv" else "signed")
    a_m = _as_matrix(a)
    b_m = _as_matrix(b)
    # Only the first ROWS entries of the second tensor act as the row scalars.
    per_row = b_m.reshape(-1)[:ROWS].astype(np.float32)[:, None]
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    ops = {
        "rowexpandmul": lambda: a_m * per_row,
        "rowexpanddiv": lambda: a_m / per_row,
        "rowexpandsub": lambda: a_m - per_row,
    }
    if op not in ops:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): ops[op]().astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All-zero inputs; the golden output is simply filled with *scalar*."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    golden = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: golden})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Emit inputs and a packed predicate-mask golden for cmp / cmps.

    cmp tests src0 < src1 elementwise; cmps tests src0 > scalar.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        a_name, b_name = meta.inputs
        a = _float_values(rng, meta.elem_counts[a_name], style="cmp")
        b = _float_values(rng, meta.elem_counts[b_name], style="cmp")
        pred = _as_matrix(a) < _as_matrix(b)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        a_name = meta.inputs[0]
        a = _float_values(rng, meta.elem_counts[a_name], style="cmp")
        b_name, b = None, None
        pred = _as_matrix(a) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    if b is not None and b_name is not None:
        buffers[b_name] = b
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """A random packed mask selects between two random float inputs."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    golden = np.where(mask_bits, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """select_mode == 1 picks the first input wholesale, otherwise the second."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    golden = a if int(select_mode) == 1 else b
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Emit input and golden for integer ops applied to the input with itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shifts use small non-negative amounts; other ops use the full bitwise range.
    src = _int_values(rng, meta.elem_counts[src_name], dtype,
                      style="shift_small" if op in {"shl", "shr"} else "bitwise")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    ops = {
        "and": lambda: np.bitwise_and(src, src),
        "or": lambda: np.bitwise_or(src, src),
        "xor": lambda: np.bitwise_xor(src, src),
        "shl": lambda: np.left_shift(src, src),
        "shr": lambda: np.right_shift(src, src),
        "not": lambda: np.bitwise_not(src),
    }
    if op not in ops:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(ops[op](), dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Emit input and golden for integer ops against an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(rng, meta.elem_counts[src_name], dtype,
                      style="shift_small" if op in {"shls", "shrs"} else "bitwise")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the Python scalar through the target dtype first (wraps like device).
    imm = np.asarray(scalar, dtype=dtype).item()
    ops = {
        "ands": lambda: np.bitwise_and(src, imm),
        "ors": lambda: np.bitwise_or(src, imm),
        "xors": lambda: np.bitwise_xor(src, imm),
        "shls": lambda: np.left_shift(src, imm),
        "shrs": lambda: np.right_shift(src, imm),
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(ops[op](), dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two raw binaries elementwise within *eps* (atol and rtol).

    Returns True on match; on mismatch prints an [ERROR] line describing the
    worst element and returns False.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff itself cannot overflow or round.
    wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
    g = golden.astype(wide, copy=False)
    o = output.astype(wide, copy=False)
    abs_diff = np.abs(g - o)
    worst = int(np.argmax(abs_diff))
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(abs_diff[worst])} at idx={worst} "
        f"(golden={g[worst]}, out={o[worst]}, dtype={dtype_np})"
    )
    return False
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.int32, 0.0) diff --git a/test/samples/Shls/shls_golden.py b/test/samples/Shls/shls_golden.py new file mode 100755 index 00000000..020268aa --- /dev/null +++ b/test/samples/Shls/shls_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
+import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + 
rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_scalar_case("shls", 2, np.int32) diff --git a/test/samples/Shr/shr_compare.py b/test/samples/Shr/shr_compare.py new file mode 100755 index 00000000..165980ed --- /dev/null +++ b/test/samples/Shr/shr_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, 
int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 
0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int 
= COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.int32, 0.0) diff --git a/test/samples/Shr/shr_golden.py b/test/samples/Shr/shr_golden.py new file mode 100755 index 00000000..fdadc6b5 --- /dev/null +++ b/test/samples/Shr/shr_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_self_case("shr", np.int32) diff --git a/test/samples/Shrs/shrs_compare.py b/test/samples/Shrs/shrs_compare.py new file mode 100755 index 00000000..165980ed --- /dev/null +++ b/test/samples/Shrs/shrs_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + 
+import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, 
rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D 
array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = 
np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + 
buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 
1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.int32, 0.0) diff --git a/test/samples/Shrs/shrs_golden.py b/test/samples/Shrs/shrs_golden.py new file mode 100755 index 00000000..91c87797 --- /dev/null +++ b/test/samples/Shrs/shrs_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + 
np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_scalar_case("shrs", 2, np.int32) diff --git a/test/samples/Sqrt/sqrt_compare.py b/test/samples/Sqrt/sqrt_compare.py new file mode 100755 index 00000000..081d562c --- /dev/null +++ b/test/samples/Sqrt/sqrt_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-3) diff --git a/test/samples/Sqrt/sqrt_golden.py b/test/samples/Sqrt/sqrt_golden.py new file mode 100755 index 00000000..479de04b --- /dev/null +++ b/test/samples/Sqrt/sqrt_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
+import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + 
rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_unary_float_case("sqrt") diff --git a/test/samples/Sub/sub_compare.py b/test/samples/Sub/sub_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Sub/sub_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: 
Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
def generate_binary_float_case(op: str):
    """Generate inputs + golden for an elementwise float op with two tensor operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors must stay away from zero.
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    handlers = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in handlers:
        raise ValueError(f"unsupported binary float op: {op}")
    out = handlers[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate inputs + golden for tensor-scalar (and some unary) float ops.

    `scalar_left` flips divs to scalar / tensor instead of tensor / scalar.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    # NOTE(review): for divs/rems and the log family the source is drawn a second
    # time below; the extra draw advances the RNG and the later draw is what gets
    # written.  Preserved as-is so previously generated .bin/golden data stays
    # byte-reproducible — confirm before simplifying.
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    sval = np.float32(scalar)
    if op == "adds":
        out = src + sval
    elif op == "subs":
        out = src - sval
    elif op == "muls":
        out = src * sval
    elif op == "divs":
        out = sval / src if scalar_left else src / sval
    elif op == "maxs":
        out = np.maximum(src, sval)
    elif op == "mins":
        out = np.minimum(src, sval)
    elif op == "rems":
        out = np.fmod(src, sval)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * sval)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate inputs + golden for a single-operand elementwise float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the op's domain valid
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    handlers = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda a: 1.0 / np.sqrt(a),
        "recip": lambda a: 1.0 / a,
        "relu": lambda a: np.maximum(a, np.float32(0.0)),
    }
    if op not in handlers:
        raise ValueError(f"unsupported unary float op: {op}")
    out = handlers[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs + golden for PReLU: x if x > 0 else x * slope (per element)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs + golden for three-operand add/sub-with-carry ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate inputs + golden for scalar add/sub-with-carry ops.

    NOTE(review): golden is src +/- scalar plus src again — presumably the
    hardware op accumulates into a destination pre-loaded with src; confirm
    against the kernel before changing.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Generate inputs + golden for a row-wise reduction (one value per row)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs + golden for a column-wise reduction.

    colsum cases declare an extra scratch buffer; with `accumulate` the output
    buffer is pre-seeded with random values that the sum is added onto.
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast each row's first element across that entire row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast the first row across every row of the output matrix."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Combine each row of src0 with a per-row scalar taken from src1.

    The per-row scalars are the first ROWS elements of src1's flat layout.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    src0_m = _as_matrix(src0)
    src1_m = _as_matrix(src1)
    row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = src0_m * row_scalars[:, None]
    elif op == "rowexpanddiv":
        out = src0_m / row_scalars[:, None]
    elif op == "rowexpandsub":
        out = src0_m - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for a fill op: the whole output equals `scalar`; inputs stay zero."""
    meta = load_case_meta()
    buffers = _default_buffers(meta)
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: out})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs + packed predicate-mask golden for compare ops.

    cmp:  per-element src0 < src1.   cmps: per-element src0 > scalar.
    """
    meta = load_case_meta()
    rng = _rng()
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        src0_name, src1_name = meta.inputs
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp")
        pred = _as_matrix(src0) < _as_matrix(src1)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src0_name = meta.inputs[0]
        src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp")
        src1_name = None
        src1 = None
        pred = _as_matrix(src0) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    if src1 is not None and src1_name is not None:
        buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    packed = pack_predicate_mask(pred, storage_cols=storage_cols)
    _write_golden(meta, {out_name: packed})


def generate_sel_case():
    """Generate inputs + golden for mask-select: out = mask ? src0 : src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_sels_case(select_mode: int):
    """Generate inputs + golden for scalar-select: whole src0 (mode 1) or whole src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = src0 if int(select_mode) == 1 else src1
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate inputs + golden for integer bitwise ops applied to src with itself."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shifts need small non-negative amounts; other ops use the full bitwise range.
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shl", "shr"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    handlers = {
        "and": lambda a: np.bitwise_and(a, a),
        "or": lambda a: np.bitwise_or(a, a),
        "xor": lambda a: np.bitwise_xor(a, a),
        "shl": lambda a: np.left_shift(a, a),
        "shr": lambda a: np.right_shift(a, a),
        "not": np.bitwise_not,
    }
    if op not in handlers:
        raise ValueError(f"unsupported bitwise op: {op}")
    out = handlers[op](src)
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate inputs + golden for integer bitwise ops against a scalar operand."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _int_values(
        rng,
        meta.elem_counts[src_name],
        dtype,
        style="shift_small" if op in {"shls", "shrs"} else "bitwise",
    )
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    # Coerce the scalar through the target dtype (wrapping like the device would).
    scalar = np.asarray(scalar, dtype=dtype).item()
    handlers = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in handlers:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    out = handlers[op](src, scalar)
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a produced .bin against its golden with atol=rtol=eps; True when close."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if not golden.size:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    # Widen before subtracting so the diff itself cannot overflow or lose precision.
    if np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
        golden_cmp = golden.astype(np.int64, copy=False)
        output_cmp = output.astype(np.int64, copy=False)
    else:
        golden_cmp = golden.astype(np.float64, copy=False)
        output_cmp = output.astype(np.float64, copy=False)
    abs_diff = np.abs(golden_cmp - output_cmp)
    idx = int(np.argmax(abs_diff))
    diff = float(abs_diff[idx])
    print(
        f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
        f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
    )
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate masks, looking only at the meaningful row prefix bytes."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # NOTE(review): here `cols` is the per-row byte stride of the packed buffer;
    # only the first row_bytes of each row carry mask bits — the rest is padding.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False


def compare_all_outputs(dtype, eps):
    """Compare every declared output buffer; gate the process via finalize_compare."""
    meta = load_case_meta()
    results = [
        compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask; gate via finalize_compare."""
    meta = load_case_meta()
    results = [
        compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols)
        for name in meta.outputs
    ]
    return finalize_compare(all(results))
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Sub/sub_golden.py b/test/samples/Sub/sub_golden.py new file mode 100755 index 00000000..6fb865db --- /dev/null +++ b/test/samples/Sub/sub_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
def _rng():
    """Fresh deterministic generator so every run of the script emits identical data."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Reshape a flat buffer to (rows, cols), validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 samples from the distribution named by `style`."""
    if style == "signed":
        return rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
    if style == "signed_small":
        return rng.uniform(-1.5, 1.5, size=count).astype(np.float32)
    if style == "nonzero_signed":
        # Signed values with magnitudes pushed away from zero (safe divisors).
        vals = rng.uniform(-3.0, 3.0, size=count).astype(np.float32)
        tiny = np.abs(vals) < np.float32(0.25)
        vals[tiny] = np.where(vals[tiny] >= 0.0, np.float32(0.25), np.float32(-0.25))
        return vals
    if style == "positive":
        return rng.uniform(0.25, 4.0, size=count).astype(np.float32)
    if style in ("exp", "cmp"):
        return rng.uniform(-2.0, 2.0, size=count).astype(np.float32)
    raise ValueError(f"unsupported float style: {style}")


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw `count` integer samples of `dtype` from a `style`-specific range."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style != "bitwise":
            raise ValueError(f"unsupported int16 style: {style}")
        raw = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            raw = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            # Small non-negative shift amounts.
            raw = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return raw.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes needed per row of a packed predicate mask: one 8-byte word per 64 columns."""
    return ((cols + 63) // 64) * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a 2D boolean matrix into per-row little-endian 64-bit predicate words.

    Bit i of each word corresponds to column (word_index * 64 + i); each row is
    padded with zero bytes up to `storage_cols`.  Returns a flat uint8 buffer.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = np.zeros((rows, storage_cols), dtype=np.uint8)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = 0
            for b in range(min(64, cols - base)):
                if bits[r, base + b]:
                    word |= 1 << b
            packed[r, w * 8:(w + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8)
    return packed.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover the (rows, cols) boolean matrix."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for r in range(rows):
        for w, base in enumerate(range(0, cols, 64)):
            word = int.from_bytes(bytes(packed[r, w * 8:(w + 1) * 8]), "little")
            for b in range(min(64, cols - base)):
                bits[r, base + b] = ((word >> b) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zero buffer of the declared size/dtype for `name`."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-initialized buffer for every name main.cpp reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every read buffer to <name>.bin, validating size and dtype."""
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize every expected output to golden_<name>.bin, validating size/dtype."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the sole declared output name; fail loudly otherwise."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs + golden for an elementwise float op with two tensor operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors must stay away from zero.
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    handlers = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in handlers:
        raise ValueError(f"unsupported binary float op: {op}")
    out = handlers[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate inputs + golden for tensor-scalar (and some unary) float ops.

    `scalar_left` flips divs to scalar / tensor instead of tensor / scalar.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    # NOTE(review): for divs/rems and the log family the source is drawn a second
    # time below; the extra draw advances the RNG and the later draw is what gets
    # written.  Preserved as-is so previously generated .bin/golden data stays
    # byte-reproducible — confirm before simplifying.
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    sval = np.float32(scalar)
    if op == "adds":
        out = src + sval
    elif op == "subs":
        out = src - sval
    elif op == "muls":
        out = src * sval
    elif op == "divs":
        out = sval / src if scalar_left else src / sval
    elif op == "maxs":
        out = np.maximum(src, sval)
    elif op == "mins":
        out = np.minimum(src, sval)
    elif op == "rems":
        out = np.fmod(src, sval)
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * sval)
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate inputs + golden for a single-operand elementwise float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        style = "positive"  # keep the op's domain valid
    else:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    handlers = {
        "abs": np.abs,
        "neg": np.negative,
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "rsqrt": lambda a: 1.0 / np.sqrt(a),
        "recip": lambda a: 1.0 / a,
        "relu": lambda a: np.maximum(a, np.float32(0.0)),
    }
    if op not in handlers:
        raise ValueError(f"unsupported unary float op: {op}")
    out = handlers[op](src)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_prelu_case():
    """Generate inputs + golden for PReLU: x if x > 0 else x * slope (per element)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    out = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_addc_case(op: str):
    """Generate inputs + golden for three-operand add/sub-with-carry ops."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[a_name] = a
    buffers[b_name] = b
    buffers[c_name] = c
    _write_buffers(meta, buffers)
    if op == "addc":
        out = a + b + c
    elif op == "subc":
        out = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Generate inputs + golden for scalar add/sub-with-carry ops.

    NOTE(review): golden is src +/- scalar plus src again — presumably the
    hardware op accumulates into a destination pre-loaded with src; confirm
    against the kernel before changing.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "addsc":
        out = src + np.float32(scalar) + src
    elif op == "subsc":
        out = src - np.float32(scalar) + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Generate inputs + golden for a row-wise reduction (one value per row)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    if op == "rowsum":
        out = src_m.sum(axis=1, dtype=np.float32)
    elif op == "rowmax":
        out = src_m.max(axis=1)
    elif op == "rowmin":
        out = src_m.min(axis=1)
    else:
        raise ValueError(f"unsupported row reduction op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs + golden for a column-wise reduction.

    colsum cases declare an extra scratch buffer; with `accumulate` the output
    buffer is pre-seeded with random values that the sum is added onto.
    """
    meta = load_case_meta()
    tmp_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        out = src_m.sum(axis=0, dtype=np.float32)
        if accumulate:
            out = out + out_init
    elif op == "colmax":
        out = src_m.max(axis=0)
    elif op == "colmin":
        out = src_m.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: out.astype(np.float32)})


def generate_rowexpand_case():
    """Broadcast each row's first element across that entire row."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:, :1], COLS, axis=1)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast the first row across every row of the output matrix."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    src_m = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    out = np.repeat(src_m[:1, :], ROWS, axis=0)
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Combine each row of src0 with a per-row scalar taken from src1.

    The per-row scalars are the first ROWS elements of src1's flat layout.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src0_name, src1_name = meta.inputs
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(
        rng,
        meta.elem_counts[src1_name],
        style="nonzero_signed" if op == "rowexpanddiv" else "signed",
    )
    src0_m = _as_matrix(src0)
    src1_m = _as_matrix(src1)
    row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    if op == "rowexpandmul":
        out = src0_m * row_scalars[:, None]
    elif op == "rowexpanddiv":
        out = src0_m / row_scalars[:, None]
    elif op == "rowexpandsub":
        out = src0_m - row_scalars[:, None]
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Golden for a fill op: the whole output equals `scalar`; inputs stay zero."""
    meta = load_case_meta()
    buffers = _default_buffers(meta)
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: out})
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("sub") diff --git a/test/samples/Subc/subc_compare.py b/test/samples/Subc/subc_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Subc/subc_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
+import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + 
rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = 
_as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if 
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Subc/subc_golden.py b/test/samples/Subc/subc_golden.py new file mode 100755 index 00000000..fcb6a7c5 --- /dev/null +++ b/test/samples/Subc/subc_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + 
np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: 
CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op 
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
_default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if 
len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = 
int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_addc_case("subc") diff --git a/test/samples/Subs/subs_compare.py b/test/samples/Subs/subs_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Subs/subs_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/Subs/subs_golden.py b/test/samples/Subs/subs_golden.py new file mode 100755 index 00000000..3c8ddaab --- /dev/null +++ b/test/samples/Subs/subs_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os 
+import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + 
rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng 
= _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + 
_write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], 
def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Generate inputs/golden for column reductions (axis=0).

    colsum additionally expects a scratch buffer as a second input and,
    when ``accumulate`` is set, adds the pre-seeded output buffer into the
    golden result.
    """
    meta = load_case_meta()
    scratch_name = None
    if op == "colsum":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, scratch_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name = meta.inputs[0]
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    if scratch_name is not None:
        bufs[scratch_name] = np.zeros(
            meta.elem_counts[scratch_name], dtype=meta.np_types[scratch_name]
        )
    out_name = _single_output(meta)
    # The accumulate draw must happen after the source draw so the RNG
    # stream matches the original generator.
    if accumulate:
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    bufs[out_name] = out_init
    _write_buffers(meta, bufs)
    if op == "colsum":
        golden = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            golden = golden + out_init
    elif op == "colmax":
        golden = matrix.max(axis=0)
    elif op == "colmin":
        golden = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: golden.astype(np.float32)})


def generate_rowexpand_case():
    """Golden broadcasts each row's first element across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    _write_buffers(meta, bufs)
    golden = np.broadcast_to(matrix[:, :1], (matrix.shape[0], COLS))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Golden broadcasts the first row across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    _write_buffers(meta, bufs)
    golden = np.broadcast_to(matrix[:1, :], (ROWS, matrix.shape[1]))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Binary op between the source matrix and a per-row scalar from src1.

    Only the first ROWS elements of the second buffer act as row scalars;
    rowexpanddiv draws them away from zero to keep the division well-defined.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    mat_name, vec_name = meta.inputs
    mat_vals = _float_values(rng, meta.elem_counts[mat_name], style="signed")
    vec_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    vec_vals = _float_values(rng, meta.elem_counts[vec_name], style=vec_style)
    matrix = _as_matrix(mat_vals)
    # _as_matrix also validates the second buffer's element count.
    per_row = _as_matrix(vec_vals).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    bufs = _default_buffers(meta)
    bufs[mat_name] = mat_vals
    bufs[vec_name] = vec_vals
    _write_buffers(meta, bufs)
    if op == "rowexpandmul":
        golden = matrix * per_row
    elif op == "rowexpanddiv":
        golden = matrix / per_row
    elif op == "rowexpandsub":
        golden = matrix - per_row
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """All-zero inputs; golden is a constant fill of *scalar*."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    fill = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: fill})
def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed-predicate golden mask for cmp/cmps.

    cmp:  mask bit set where src0 < src1.
    cmps: mask bit set where src0 > scalar.
    The output buffer's element count must be a multiple of ROWS; the
    per-row storage stride is derived from it.
    """
    meta = load_case_meta()
    rng = _rng()
    second_name = None
    second = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        first_name, second_name = meta.inputs
        first = _float_values(rng, meta.elem_counts[first_name], style="cmp")
        second = _float_values(rng, meta.elem_counts[second_name], style="cmp")
        pred = _as_matrix(first) < _as_matrix(second)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        first_name = meta.inputs[0]
        first = _float_values(rng, meta.elem_counts[first_name], style="cmp")
        pred = _as_matrix(first) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    bufs = _default_buffers(meta)
    bufs[first_name] = first
    if second is not None and second_name is not None:
        bufs[second_name] = second
    _write_buffers(meta, bufs)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Generate a packed mask plus two sources; golden = where(mask, a, b)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    picks = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(picks, storage_cols=storage_cols)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    bufs = _default_buffers(meta)
    bufs[mask_name] = packed_mask
    bufs[a_name] = a
    bufs[b_name] = b
    _write_buffers(meta, bufs)
    golden = np.where(picks, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden is src0 when select_mode == 1, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    bufs = _default_buffers(meta)
    bufs[a_name] = a
    bufs[b_name] = b
    _write_buffers(meta, bufs)
    chosen = a if int(select_mode) == 1 else b
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise/shift op applied to the buffer with itself as both operands.

    Shift ops draw small non-negative shift amounts so the self-shift is
    well-defined.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    draw_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=draw_style)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    _write_buffers(meta, bufs)
    two_operand = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(src)
    elif op in two_operand:
        result = two_operand[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Bitwise/shift op between the buffer and an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    draw_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=draw_style)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    _write_buffers(meta, bufs)
    # Wrap the scalar to the target dtype (matches device-side truncation).
    rhs = np.asarray(scalar, dtype=dtype).item()
    ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = ops[op](src, rhs)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two raw .bin buffers elementwise; return True iff they match.

    eps is used as both atol and rtol; NaN == NaN counts as equal. On a
    mismatch the worst element is reported and False is returned.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before subtracting so the reported diff cannot wrap around.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare two packed predicate-mask buffers byte-for-byte.

    Only the leading row_bytes of each row's `cols`-byte storage stride are
    meaningful; padding bytes are ignored.
    """
    for path, label in ((output_path, "Output"), (golden_path, "Golden")):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    if np.array_equal(golden_sel, output_sel):
        return True
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    idx = int(mismatches[0]) if mismatches.size else 0
    print(
        f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
        f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
    )
    return False
diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_scalar_float_case("subs", 3.14) diff --git a/test/samples/Subsc/subsc_compare.py b/test/samples/Subsc/subsc_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/Subsc/subsc_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, 
int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 
0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int 
def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: expand packed rows to a bool grid.

    The per-row storage stride is derived from the buffer size; bytes beyond
    the packed prefix of each row are ignored.
    """
    raw = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if raw.size % rows != 0:
        raise ValueError(f"mask buffer size {raw.size} is not divisible by rows={rows}")
    storage_cols = raw.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = raw.reshape(rows, storage_cols)[:, :row_bytes]
    # Bytes are little-endian within each 64-bit word, so an LSB-first unpack
    # recovers column j at flat bit position j; trailing pad bits are sliced off.
    expanded = np.unpackbits(packed, axis=1, bitorder="little")
    return expanded[:, :cols].astype(np.bool_)


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zeros buffer with the declared element count and dtype of *name*."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every name in main.cpp's read order."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every read-order buffer to <name>.bin, validating sizes."""
    for name in meta.read_order:
        try:
            payload = buffers[name]
        except KeyError:
            raise KeyError(f"missing buffer for {name}") from None
        flat = np.asarray(payload, dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} elements, got {flat.size}")
        flat.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize every declared output to golden_<name>.bin, validating sizes."""
    for name in meta.outputs:
        try:
            payload = outputs[name]
        except KeyError:
            raise KeyError(f"missing golden for {name}") from None
        flat = np.asarray(payload, dtype=meta.np_types[name]).reshape(-1)
        want = meta.elem_counts[name]
        if flat.size != want:
            raise ValueError(f"{name}: expected {want} golden elements, got {flat.size}")
        flat.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the sole output name; the template cases all declare one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate two inputs and the elementwise golden for binary float ops.

    div/rem draw the divisor away from zero to keep results finite.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs = _float_values(
        rng,
        meta.elem_counts[rhs_name],
        style="nonzero_signed" if op in {"div", "rem"} else "signed",
    )
    bufs = _default_buffers(meta)
    bufs[lhs_name] = lhs
    bufs[rhs_name] = rhs
    _write_buffers(meta, bufs)
    ops = {
        "add": lambda a, b: a + b,
        "sub": lambda a, b: a - b,
        "mul": lambda a, b: a * b,
        "div": lambda a, b: a / b,
        "max": np.maximum,
        "min": np.minimum,
        "rem": np.fmod,
    }
    if op not in ops:
        raise ValueError(f"unsupported binary float op: {op}")
    golden = ops[op](lhs, rhs)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
== "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = 
def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Generate inputs and a packed-predicate golden mask for cmp/cmps.

    cmp:  mask bit set where src0 < src1.
    cmps: mask bit set where src0 > scalar.
    """
    meta = load_case_meta()
    rng = _rng()
    second_name = None
    second = None
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        first_name, second_name = meta.inputs
        first = _float_values(rng, meta.elem_counts[first_name], style="cmp")
        second = _float_values(rng, meta.elem_counts[second_name], style="cmp")
        pred = _as_matrix(first) < _as_matrix(second)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        first_name = meta.inputs[0]
        first = _float_values(rng, meta.elem_counts[first_name], style="cmp")
        pred = _as_matrix(first) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    bufs = _default_buffers(meta)
    bufs[first_name] = first
    if second is not None and second_name is not None:
        bufs[second_name] = second
    _write_buffers(meta, bufs)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})


def generate_sel_case():
    """Generate a packed mask plus two sources; golden = where(mask, a, b)."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    picks = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(picks, storage_cols=storage_cols)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    bufs = _default_buffers(meta)
    bufs[mask_name] = packed_mask
    bufs[a_name] = a
    bufs[b_name] = b
    _write_buffers(meta, bufs)
    golden = np.where(picks, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Golden is src0 when select_mode == 1, otherwise src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    bufs = _default_buffers(meta)
    bufs[a_name] = a
    bufs[b_name] = b
    _write_buffers(meta, bufs)
    chosen = a if int(select_mode) == 1 else b
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise/shift op applied to the buffer with itself as both operands."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    draw_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=draw_style)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    _write_buffers(meta, bufs)
    two_operand = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        result = np.bitwise_not(src)
    elif op in two_operand:
        result = two_operand[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Bitwise/shift op between the buffer and an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    (src_name,) = meta.inputs
    draw_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=draw_style)
    bufs = _default_buffers(meta)
    bufs[src_name] = src
    _write_buffers(meta, bufs)
    # Wrap the scalar to the target dtype (matches device-side truncation).
    rhs = np.asarray(scalar, dtype=dtype).item()
    ops = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in ops:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    result = ops[op](src, rhs)
    _write_golden(meta, {_single_output(meta): np.asarray(result, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a golden .bin against a device output .bin elementwise.

    eps is used as both atol and rtol; NaN == NaN counts as a match.
    Returns True on match, otherwise prints a diagnostic and returns False.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        if golden.size:
            # Widen before subtracting so the reported diff does not wrap.
            # NOTE(review): uint64 values above 2**63 would overflow the int64
            # cast here; confirm no case uses such values.
            if np.issubdtype(dtype_np, np.floating):
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
                golden_cmp = golden.astype(np.int64, copy=False)
                output_cmp = output.astype(np.int64, copy=False)
            else:
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            abs_diff = np.abs(golden_cmp - output_cmp)
            idx = int(np.argmax(abs_diff))
            diff = float(abs_diff[idx])
            print(
                f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
                f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
            )
        else:
            print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    return True


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare two packed predicate-mask buffers byte-for-byte.

    Only the leading row_bytes of each row's storage stride are meaningful;
    padding bytes are ignored.
    """
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # NOTE(review): assumes the per-row storage stride is `cols` bytes --
    # confirm against the kernel's mask layout.
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel != output_sel)[0]
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True


def compare_all_outputs(dtype, eps):
    """Compare every declared output against its golden file; report via finalize_compare."""
    meta = load_case_meta()
    ok = True
    # compare_bin runs first so every output is checked and reported.
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    """Compare every declared output as a packed predicate mask."""
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    """Print the verdict; exit(2) on failure unless COMPARE_STRICT=0."""
    strict = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if strict:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True

if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-4)
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 
style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + 
buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = 
np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, 
b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = 
load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + 
elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, 
scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden 
= np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_scalar_carry_case("subsc", 3.14) diff --git a/test/samples/VectorAddition/vadd_pto_ir_compare.py b/test/samples/VectorAddition/vadd_pto_ir_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/VectorAddition/vadd_pto_ir_compare.py @@ -0,0 
+1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return 
np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) 
+ if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise 
KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 
1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], 
style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): 
+ meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = 
_single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], 
style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise 
ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} 
else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != 
output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        if golden.size:
            # Widen to a common comparable type so the diff report cannot overflow.
            if np.issubdtype(dtype_np, np.floating):
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger):
                golden_cmp = golden.astype(np.int64, copy=False)
                output_cmp = output.astype(np.int64, copy=False)
            else:
                golden_cmp = golden.astype(np.float64, copy=False)
                output_cmp = output.astype(np.float64, copy=False)
            abs_diff = np.abs(golden_cmp - output_cmp)
            # Report the single worst element to aid debugging.
            idx = int(np.argmax(abs_diff))
            diff = float(abs_diff[idx])
            print(
                f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
                f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
            )
        else:
            print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
        return False
    return True


def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare packed predicate masks, ignoring padding past each row's packed words."""
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if golden.size < need or output.size < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    golden = golden[:need].reshape(rows, cols)
    output = output[:need].reshape(rows, cols)
    # NOTE(review): cols is treated here as the byte stride of each packed row;
    # only the leading packed bytes of every row are compared — confirm intent.
    row_bytes = min(_packed_row_bytes(cols), cols)
    golden_sel = golden[:, :row_bytes].reshape(-1)
    output_sel = output[:, :row_bytes].reshape(-1)
    if not np.array_equal(golden_sel, output_sel):
        diff = np.nonzero(golden_sel 
!= output_sel)[0]
        # First differing byte (or 0 if the arrays differ only in size handling).
        idx = int(diff[0]) if diff.size else 0
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True


def compare_all_outputs(dtype, eps):
    # Compare every declared output against its golden file; all must match.
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok
    return finalize_compare(ok)


def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS):
    # Same as compare_all_outputs, but interpret outputs as packed predicate masks.
    meta = load_case_meta()
    ok = True
    for name in meta.outputs:
        ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok
    return finalize_compare(ok)


def finalize_compare(ok: bool):
    # COMPARE_STRICT=0 downgrades a failure to a warning (process still exits 0).
    strict = os.getenv("COMPARE_STRICT", "1") != "0"
    if not ok:
        if strict:
            print("[ERROR] compare failed")
            sys.exit(2)
        print("[WARN] compare failed (non-gating)")
        return False
    print("[INFO] compare passed")
    return True

if __name__ == "__main__":
    compare_all_outputs(np.float32, 1e-4)
diff --git a/test/samples/VectorAddition/vadd_pto_ir_golden.py b/test/samples/VectorAddition/vadd_pto_ir_golden.py
new file mode 100755
index 00000000..b193bfff
--- /dev/null
+++ b/test/samples/VectorAddition/vadd_pto_ir_golden.py
@@ -0,0 +1,737 @@
#!/usr/bin/python3
# coding=utf-8

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


# Deterministic seed and tile geometry shared by every generated case.
SEED = 19
ROWS = 32
COLS = 32

# Host-side C/ACL type names as they appear in main.cpp, mapped to numpy dtypes.
# bfloat16 has no native numpy dtype, so its raw 16-bit pattern is kept as uint16.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
"unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, 
size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), 
dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: 
expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], 
style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def 
generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = 
pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, 
dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) 
+ output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = 
load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("add") diff --git a/test/samples/VectorAddition/vectorAddition_compare.py b/test/samples/VectorAddition/vectorAddition_compare.py new file mode 100755 index 00000000..891703ea --- /dev/null +++ b/test/samples/VectorAddition/vectorAddition_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> 
CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): 
+ if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), 
dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = 
_float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * 
np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = 
load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: 
expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise 
ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = 
np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: 
{output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.float32, 1e-4) diff --git a/test/samples/VectorAddition/vectorAddition_golden.py 
#!/usr/bin/python3
# coding=utf-8
"""Standalone golden generator/comparator for one NPU sample test case.

Buffer names, element counts and host types are parsed out of the case's
main.cpp; randomized input ``<name>.bin`` files and matching
``golden_<name>.bin`` reference files are written into the case directory.
"""

import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import numpy as np


SEED = 19   # fixed seed: generated inputs and goldens are reproducible
ROWS = 32   # logical matrix height used by row/col/mask helpers
COLS = 32   # logical matrix width

# Host (C++) type name -> numpy dtype used for the raw .bin files.
# NOTE(review): bfloat16_t is stored as raw uint16 bit patterns.
_HOST_TYPE_TO_NP = {
    "aclFloat16": np.float16,
    "bfloat16_t": np.uint16,
    "bool": np.bool_,
    "double": np.float64,
    "float": np.float32,
    "half": np.float16,
    "int": np.int32,
    "int8_t": np.int8,
    "int16_t": np.int16,
    "int32_t": np.int32,
    "int64_t": np.int64,
    "size_t": np.uint64,
    "uint8_t": np.uint8,
    "uint16_t": np.uint16,
    "uint32_t": np.uint32,
    "uint64_t": np.uint64,
    "unsigned": np.uint32,
}


@dataclass
class CaseMeta:
    """Per-case buffer metadata parsed from main.cpp and outputs.txt."""

    elem_counts: Dict[str, int]    # buffer name -> element count
    np_types: Dict[str, np.dtype]  # buffer name -> on-disk numpy dtype
    read_order: List[str]          # order in which main.cpp reads the .bin files
    outputs: List[str]             # names listed in outputs.txt

    @property
    def inputs(self) -> List[str]:
        # Every buffer the case reads that is not declared as an output.
        return [buf for buf in self.read_order if buf not in self.outputs]


def _host_type_to_np(host_type: str) -> np.dtype:
    """Map a host C++ type name to its on-disk numpy dtype."""
    key = host_type.strip()
    if key not in _HOST_TYPE_TO_NP:
        raise KeyError(f"unsupported host type: {key}")
    return np.dtype(_HOST_TYPE_TO_NP[key])


def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta:
    """Parse buffer metadata out of the case's main.cpp (and optional outputs.txt)."""
    text = Path(main_cpp).read_text(encoding="utf-8")
    elem_counts = {}
    for m in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text):
        elem_counts[m.group(1)] = int(m.group(2))
    np_types = {}
    for m in re.finditer(
        r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);",
        text,
    ):
        np_types[m.group(1)] = _host_type_to_np(m.group(2))
    read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text)
    outputs = []
    outputs_file = Path(outputs_txt)
    if outputs_file.is_file():
        outputs = [
            line.strip()
            for line in outputs_file.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
    return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs)


def _rng():
    """Fresh deterministic generator; every case starts from the same SEED."""
    return np.random.default_rng(SEED)


def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """View a flat buffer as a (rows, cols) matrix, validating the element count."""
    flat = np.asarray(arr).reshape(-1)
    expected = rows * cols
    if flat.size != expected:
        raise ValueError(f"expected {expected} elements, got {flat.size}")
    return flat.reshape(rows, cols)


def _float_values(rng, count: int, *, style: str) -> np.ndarray:
    """Draw `count` float32 values from the range selected by `style`."""
    ranges = {
        "signed": (-3.0, 3.0),
        "signed_small": (-1.5, 1.5),
        "nonzero_signed": (-3.0, 3.0),
        "positive": (0.25, 4.0),
        "exp": (-2.0, 2.0),
        "cmp": (-2.0, 2.0),
    }
    if style not in ranges:
        raise ValueError(f"unsupported float style: {style}")
    lo, hi = ranges[style]
    arr = rng.uniform(lo, hi, size=count).astype(np.float32)
    if style == "nonzero_signed":
        # Push near-zero samples to +/-0.25 so divisions stay well-conditioned.
        near_zero = np.abs(arr) < np.float32(0.25)
        arr[near_zero] = np.where(arr[near_zero] >= 0.0, np.float32(0.25), np.float32(-0.25))
    return arr


def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray:
    """Draw `count` integer values of `dtype` in the range selected by `style`."""
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.int16):
        if style != "bitwise":
            raise ValueError(f"unsupported int16 style: {style}")
        vals = rng.integers(-256, 256, size=count, dtype=np.int32)
    elif dtype == np.dtype(np.int32):
        if style == "bitwise":
            vals = rng.integers(-256, 256, size=count, dtype=np.int32)
        elif style == "shift_small":
            # Small non-negative values suitable as shift amounts.
            vals = rng.integers(0, 4, size=count, dtype=np.int32)
        else:
            raise ValueError(f"unsupported int32 style: {style}")
    else:
        raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}")
    return vals.astype(dtype, copy=False)


def _packed_row_bytes(cols: int) -> int:
    """Bytes of real mask data per row: one little-endian 64-bit word per 64 columns."""
    words = (cols + 63) // 64
    return words * 8


def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray:
    """Pack a (rows, cols) boolean matrix into per-row little-endian mask words.

    Each row occupies ``storage_cols`` bytes; only the first
    ``_packed_row_bytes(cols)`` bytes carry data, the rest stay zero.
    """
    bits = np.asarray(bits, dtype=np.bool_)
    if bits.ndim != 2:
        raise ValueError("mask bits must be a 2D array")
    rows, cols = bits.shape
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    out = np.zeros((rows, storage_cols), dtype=np.uint8)
    # Bit i of byte k encodes column k*8+i (little bit order within each byte,
    # bytes in little-endian word order) -- exactly what bitorder="little" yields.
    packed = np.packbits(bits, axis=1, bitorder="little")
    out[:, : packed.shape[1]] = packed
    return out.reshape(-1)


def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover the (rows, cols) boolean matrix."""
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    storage_cols = buf.size // rows
    if storage_cols < _packed_row_bytes(cols):
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    data_bytes = (cols + 7) // 8
    unpacked = np.unpackbits(packed[:, :data_bytes], axis=1, bitorder="little")
    return unpacked[:, :cols].astype(np.bool_)


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zeros buffer with the declared count and dtype for *name*."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every name the case reads."""
    defaults = {}
    for buf_name in meta.read_order:
        defaults[buf_name] = _zero_buffer(meta, buf_name)
    return defaults
def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every buffer the case reads to ``<name>.bin``, in read order.

    Raises:
        KeyError: a name from ``meta.read_order`` is missing from *buffers*.
        ValueError: a buffer's flattened size differs from the declared count.
    """
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        # Cast to the on-disk dtype parsed from main.cpp before writing.
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize the expected result for every output to ``golden_<name>.bin``."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's only output name; raise if there is not exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs and golden for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # div/rem draw the right operand away from zero to keep the golden finite.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        # np.fmod: sign of the result follows the dividend.
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input and golden for a scalar/unary float op.

    NOTE(review): for divs/rems and log/sqrt/rsqrt/recip the source tensor is
    drawn a second time below, which advances the shared RNG stream; generated
    values therefore depend on this double draw. Do not "simplify" the repeated
    _float_values calls without regenerating all golden data.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    # Redundant re-draws (see docstring): these replace src AND move the stream.
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        # scalar_left selects scalar/src instead of src/scalar.
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate input and golden for a pure unary float op (abs/neg/exp/...)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "signed"
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        # Strictly positive inputs keep log/sqrt/recip goldens finite and real.
        style = "positive"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_prelu_case():
    """PReLU: out = src where src > 0, else src * slope (elementwise slope tensor)."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    # Draw order matters: source first, then slopes (shared RNG stream).
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    buffers[slope_name] = slope
    _write_buffers(meta, buffers)
    golden = np.where(src > 0.0, src, src * slope)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})


def generate_addc_case(op: str):
    """Three-operand add/sub with carry tensor: addc -> a+b+c, subc -> a-b+c."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    a_name, b_name, c_name = meta.inputs
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    c = _float_values(rng, meta.elem_counts[c_name], style="signed_small")
    buffers = _default_buffers(meta)
    buffers.update({a_name: a, b_name: b, c_name: c})
    _write_buffers(meta, buffers)
    if op == "addc":
        result = a + b + c
    elif op == "subc":
        result = a - b + c
    else:
        raise ValueError(f"unsupported carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_scalar_carry_case(op: str, scalar: float):
    """Scalar carry op: addsc -> src+scalar+src, subsc -> src-scalar+src."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    imm = np.float32(scalar)
    if op == "addsc":
        result = src + imm + src
    elif op == "subsc":
        result = src - imm + src
    else:
        raise ValueError(f"unsupported scalar carry op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_row_reduce_case(op: str):
    """Row-wise reduction (sum/max/min) over the ROWSxCOLS matrix view."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    reducers = {
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    result = reducers[op](matrix)
    _write_golden(meta, {_single_output(meta): result.astype(np.float32)})


def generate_col_reduce_case(op: str, *, accumulate: bool = False):
    """Column-wise reduction; colsum may accumulate into a pre-seeded output."""
    meta = load_case_meta()
    if op == "colsum":
        # colsum cases declare an extra scratch buffer alongside the source.
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}")
        src_name, tmp_name = meta.inputs
    else:
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        src_name, tmp_name = meta.inputs[0], None
    rng = _rng()
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    if tmp_name is not None:
        buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name])
    out_name = _single_output(meta)
    if accumulate:
        # The device accumulates into whatever is already in the output buffer.
        out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small")
    else:
        out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name])
    buffers[out_name] = out_init
    _write_buffers(meta, buffers)
    if op == "colsum":
        result = matrix.sum(axis=0, dtype=np.float32)
        if accumulate:
            result = result + out_init
    elif op == "colmax":
        result = matrix.max(axis=0)
    elif op == "colmin":
        result = matrix.min(axis=0)
    else:
        raise ValueError(f"unsupported col reduction op: {op}")
    _write_golden(meta, {out_name: result.astype(np.float32)})
def generate_rowexpand_case():
    """Broadcast the first column of the source matrix across all COLS columns."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    first_col = _as_matrix(src)[:, :1]
    expanded = np.tile(first_col, (1, COLS))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_colexpand_case():
    """Broadcast the first row of the source matrix across all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)
    first_row = _as_matrix(src)[:1, :]
    expanded = np.tile(first_row, (ROWS, 1))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})


def generate_rowexpand_bin_case(op: str):
    """Binary op between the matrix and per-row scalars from the second input."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    rhs_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    lhs_m = _as_matrix(lhs)
    # Only the first ROWS elements of the flattened second input act as the
    # per-row scalars; the rest of that buffer is written out but unused here.
    row_scalars = _as_matrix(rhs).reshape(-1)[:ROWS].astype(np.float32)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)
    col = row_scalars[:, None]
    if op == "rowexpandmul":
        result = lhs_m * col
    elif op == "rowexpanddiv":
        result = lhs_m / col
    elif op == "rowexpandsub":
        result = lhs_m - col
    else:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    _write_golden(meta, {_single_output(meta): result.astype(np.float32).reshape(-1)})


def generate_expands_case(scalar: float):
    """Fill the single output with a constant; any declared inputs stay zero."""
    meta = load_case_meta()
    _write_buffers(meta, _default_buffers(meta))
    out_name = _single_output(meta)
    constant = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32)
    _write_golden(meta, {out_name: constant})


def generate_cmp_case(op: str, *, scalar: float = 0.0):
    """Vector/scalar compare producing a packed predicate-mask golden.

    cmp:  pred = src0 < src1 (elementwise over the matrix view)
    cmps: pred = src  > scalar
    """
    meta = load_case_meta()
    rng = _rng()
    buffers = _default_buffers(meta)
    if op == "cmp":
        if len(meta.inputs) != 2:
            raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
        name_a, name_b = meta.inputs
        lhs = _float_values(rng, meta.elem_counts[name_a], style="cmp")
        rhs = _float_values(rng, meta.elem_counts[name_b], style="cmp")
        buffers[name_a] = lhs
        buffers[name_b] = rhs
        pred = _as_matrix(lhs) < _as_matrix(rhs)
    elif op == "cmps":
        if len(meta.inputs) != 1:
            raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
        name_a = meta.inputs[0]
        lhs = _float_values(rng, meta.elem_counts[name_a], style="cmp")
        buffers[name_a] = lhs
        pred = _as_matrix(lhs) > np.float32(scalar)
    else:
        raise ValueError(f"unsupported compare op: {op}")
    _write_buffers(meta, buffers)
    out_name = _single_output(meta)
    if meta.elem_counts[out_name] % ROWS != 0:
        raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}")
    # The mask buffer's byte count per row defines the packed storage stride.
    storage_cols = meta.elem_counts[out_name] // ROWS
    _write_golden(meta, {out_name: pack_predicate_mask(pred, storage_cols=storage_cols)})
def generate_sel_case():
    """Select elementwise between two tensors using a random packed mask input."""
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, a_name, b_name = meta.inputs
    storage_cols = meta.elem_counts[mask_name] // ROWS
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    packed_mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    a = _float_values(rng, meta.elem_counts[a_name], style="signed")
    b = _float_values(rng, meta.elem_counts[b_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = packed_mask
    buffers[a_name] = a
    buffers[b_name] = b
    _write_buffers(meta, buffers)
    chosen = np.where(mask_bits, _as_matrix(a), _as_matrix(b))
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32).reshape(-1)})


def generate_sels_case(select_mode: int):
    """Whole-tensor select: mode 1 keeps src0, any other mode keeps src1."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    first_name, second_name = meta.inputs
    first = _float_values(rng, meta.elem_counts[first_name], style="signed")
    second = _float_values(rng, meta.elem_counts[second_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[first_name] = first
    buffers[second_name] = second
    _write_buffers(meta, buffers)
    chosen = first if int(select_mode) == 1 else second
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})


def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Bitwise op with both operands equal to the generated source tensor."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift ops need small non-negative shift amounts.
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    operand = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = operand
    _write_buffers(meta, buffers)
    if op == "not":
        reference = np.bitwise_not(operand)
    elif op == "and":
        reference = np.bitwise_and(operand, operand)
    elif op == "or":
        reference = np.bitwise_or(operand, operand)
    elif op == "xor":
        reference = np.bitwise_xor(operand, operand)
    elif op == "shl":
        reference = np.left_shift(operand, operand)
    elif op == "shr":
        reference = np.right_shift(operand, operand)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(reference, dtype=dtype)})


def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Bitwise op between the source tensor and an immediate scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    operand = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    buffers = _default_buffers(meta)
    buffers[src_name] = operand
    _write_buffers(meta, buffers)
    # Wrap the immediate into the tensor dtype before computing the reference.
    imm = np.asarray(scalar, dtype=dtype).item()
    dispatch = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in dispatch:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    reference = dispatch[op](operand, imm)
    _write_golden(meta, {_single_output(meta): np.asarray(reference, dtype=dtype)})


def compare_bin(golden_path, output_path, dtype, eps):
    """Compare two raw .bin files elementwise (atol = rtol = eps)."""
    for label, path in (("Output", output_path), ("Golden", golden_path)):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before subtracting so the reported diff cannot overflow.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, 
:row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_binary_float_case("add") diff --git a/test/samples/Xor/xor_compare.py b/test/samples/Xor/xor_compare.py new file mode 100755 index 00000000..780b65b1 --- /dev/null +++ b/test/samples/Xor/xor_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + 
"uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, 
size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + 
def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray:
    """Inverse of pack_predicate_mask: recover the (rows, cols) boolean matrix.

    Each row of the buffer holds ``storage_cols`` bytes; bits are read from
    little-endian 64-bit words, one word per 64 columns.
    """
    buf = np.asarray(buf, dtype=np.uint8).reshape(-1)
    if rows <= 0 or cols <= 0:
        raise ValueError("rows/cols must be positive")
    if buf.size % rows != 0:
        raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}")
    # Storage stride per row is derived from the buffer size, not from cols.
    storage_cols = buf.size // rows
    row_bytes = _packed_row_bytes(cols)
    if storage_cols < row_bytes:
        raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}")
    packed = buf.reshape(rows, storage_cols)
    bits = np.zeros((rows, cols), dtype=np.bool_)
    for row in range(rows):
        for word_idx, base_col in enumerate(range(0, cols, 64)):
            # Reassemble the 64-bit word from its 8 little-endian bytes.
            word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little")
            width = min(64, cols - base_col)
            for bit_idx in range(width):
                bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0
    return bits


def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray:
    """All-zeros buffer with the declared count and dtype for *name*."""
    return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name])


def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]:
    """Zero-filled buffer for every name the case reads."""
    return {name: _zero_buffer(meta, name) for name in meta.read_order}


def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]):
    """Serialize every buffer the case reads to ``<name>.bin``, in read order.

    Raises:
        KeyError: a name from ``meta.read_order`` is missing from *buffers*.
        ValueError: a buffer's flattened size differs from the declared count.
    """
    for name in meta.read_order:
        if name not in buffers:
            raise KeyError(f"missing buffer for {name}")
        # Cast to the on-disk dtype parsed from main.cpp before writing.
        arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} elements, got {arr.size}")
        arr.tofile(f"{name}.bin")


def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]):
    """Serialize the expected result for every output to ``golden_<name>.bin``."""
    for name in meta.outputs:
        if name not in outputs:
            raise KeyError(f"missing golden for {name}")
        arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1)
        expected = meta.elem_counts[name]
        if arr.size != expected:
            raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}")
        arr.tofile(f"golden_{name}.bin")


def _single_output(meta: CaseMeta) -> str:
    """Return the case's only output name; raise if there is not exactly one."""
    if len(meta.outputs) != 1:
        raise ValueError(f"expected exactly one output, got {meta.outputs}")
    return meta.outputs[0]


def generate_binary_float_case(op: str):
    """Generate inputs and golden for an elementwise binary float op."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # div/rem draw the right operand away from zero to keep the golden finite.
    rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    buffers = _default_buffers(meta)
    buffers[lhs_name] = lhs
    buffers[rhs_name] = rhs
    _write_buffers(meta, buffers)

    if op == "add":
        out = lhs + rhs
    elif op == "sub":
        out = lhs - rhs
    elif op == "mul":
        out = lhs * rhs
    elif op == "div":
        out = lhs / rhs
    elif op == "max":
        out = np.maximum(lhs, rhs)
    elif op == "min":
        out = np.minimum(lhs, rhs)
    elif op == "rem":
        # np.fmod: sign of the result follows the dividend.
        out = np.fmod(lhs, rhs)
    else:
        raise ValueError(f"unsupported binary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input and golden for a scalar/unary float op.

    NOTE(review): for divs/rems and log/sqrt/rsqrt/recip the source tensor is
    drawn a second time below, which advances the shared RNG stream; generated
    values therefore depend on this double draw. Do not "simplify" the repeated
    _float_values calls without regenerating all golden data.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    # Redundant re-draws (see docstring): these replace src AND move the stream.
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        # scalar_left selects scalar/src instead of src/scalar.
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})


def generate_unary_float_case(op: str):
    """Generate input and golden for a pure unary float op (abs/neg/exp/...)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    style = "signed"
    if op in {"exp"}:
        style = "exp"
    elif op in {"log", "sqrt", "rsqrt", "recip"}:
        # Strictly positive inputs keep log/sqrt/recip goldens finite and real.
        style = "positive"
    src = _float_values(rng, meta.elem_counts[src_name], style=style)
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "abs":
        out = np.abs(src)
    elif op == "neg":
        out = -src
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    elif op == "relu":
        out = np.maximum(src, np.float32(0.0))
    else:
        raise ValueError(f"unsupported unary float op: {op}")
    # (The remainder of this function -- the final _write_golden call -- lies
    # beyond the visible range of this chunk.)
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: 
{op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported 
rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = 
rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise 
op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, 
copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def 
compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + compare_all_outputs(np.int16, 0.0) diff --git a/test/samples/Xor/xor_golden.py b/test/samples/Xor/xor_golden.py new file mode 100755 index 00000000..a5786585 --- /dev/null +++ b/test/samples/Xor/xor_golden.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = 
"outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == 
np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = 
np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + 
rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src 
* np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = 
load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: 
def generate_rowexpand_bin_case(op: str):
    """Generate inputs and golden for ops combining a matrix with per-row scalars.

    The first ROWS elements of the second input supply one scalar per row,
    broadcast across that row of the first input.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    lhs_name, rhs_name = meta.inputs
    lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed")
    # Divisors must stay away from zero for the div variant.
    rhs_style = "nonzero_signed" if op == "rowexpanddiv" else "signed"
    rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style)
    lhs_m = _as_matrix(lhs)
    per_row = _as_matrix(rhs).reshape(-1)[:ROWS].astype(np.float32)[:, None]
    payload = _default_buffers(meta)
    payload[lhs_name] = lhs
    payload[rhs_name] = rhs
    _write_buffers(meta, payload)
    table = {
        "rowexpandmul": np.multiply,
        "rowexpanddiv": np.divide,
        "rowexpandsub": np.subtract,
    }
    if op not in table:
        raise ValueError(f"unsupported rowexpand binary op: {op}")
    golden = table[op](lhs_m, per_row)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32).reshape(-1)})
def generate_sel_case():
    """Generate inputs and golden for the mask-select (sel) op.

    Inputs are a packed predicate mask plus two float sources; the golden is
    an elementwise select: src0 where the mask bit is set, else src1.

    Raises:
        ValueError: on unexpected input count, or when the mask buffer size
            cannot be split into ROWS equal storage rows.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 3:
        raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}")
    rng = _rng()
    mask_name, src0_name, src1_name = meta.inputs
    # Consistency with generate_cmp_case: fail loudly when the mask buffer
    # size is not a multiple of ROWS (the bare floor division previously
    # truncated the stride silently).
    if meta.elem_counts[mask_name] % ROWS != 0:
        raise ValueError(f"{mask_name}: cannot derive mask storage stride from count={meta.elem_counts[mask_name]}")
    storage_cols = meta.elem_counts[mask_name] // ROWS
    # Mask bits are drawn first so the RNG consumption order (and therefore
    # the generated data) matches previous runs.
    mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_)
    mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols)
    src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed")
    src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed")
    buffers = _default_buffers(meta)
    buffers[mask_name] = mask
    buffers[src0_name] = src0
    buffers[src1_name] = src1
    _write_buffers(meta, buffers)
    out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1))
    _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)})
def generate_bitwise_self_case(op: str, dtype: np.dtype):
    """Generate input and golden for an integer bitwise/shift op applied to itself.

    The single source is used as both operands (golden = op(src, src)), so
    and/or reproduce the source and xor yields all zeros by construction.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift counts are drawn small and non-negative so the shift stays defined.
    value_style = "shift_small" if op in {"shl", "shr"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    payload = _default_buffers(meta)
    payload[src_name] = src
    _write_buffers(meta, payload)
    binary_ops = {
        "and": np.bitwise_and,
        "or": np.bitwise_or,
        "xor": np.bitwise_xor,
        "shl": np.left_shift,
        "shr": np.right_shift,
    }
    if op == "not":
        out = np.bitwise_not(src)
    elif op in binary_ops:
        out = binary_ops[op](src, src)
    else:
        raise ValueError(f"unsupported bitwise op: {op}")
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})
def compare_bin(golden_path, output_path, dtype, eps):
    """Compare a produced binary against its golden within tolerance ``eps``.

    Returns True on match; on any failure (missing file, shape mismatch,
    value mismatch) prints a diagnostic and returns False.
    """
    for label, path in (("Output", output_path), ("Golden", golden_path)):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    dtype_np = np.dtype(dtype)
    golden = np.fromfile(golden_path, dtype=dtype_np)
    output = np.fromfile(output_path, dtype=dtype_np)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False
    if np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
        return True
    if golden.size:
        # Widen before differencing: integers (signed or unsigned) go to
        # int64, everything else to float64, so the reported max diff cannot
        # itself overflow in the narrow dtype.
        wide = np.int64 if np.issubdtype(dtype_np, np.integer) else np.float64
        golden_cmp = golden.astype(wide, copy=False)
        output_cmp = output.astype(wide, copy=False)
        abs_diff = np.abs(golden_cmp - output_cmp)
        idx = int(np.argmax(abs_diff))
        diff = float(abs_diff[idx])
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} "
            f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})"
        )
    else:
        print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}")
    return False
def finalize_compare(ok: bool):
    """Report the compare verdict and return it.

    On failure, exits the process with status 2 unless the environment
    variable COMPARE_STRICT is set to "0", which demotes the failure to a
    non-gating warning.
    """
    if ok:
        print("[INFO] compare passed")
        return True
    if os.getenv("COMPARE_STRICT", "1") != "0":
        print("[ERROR] compare failed")
        sys.exit(2)
    print("[WARN] compare failed (non-gating)")
    return False
100755 index 00000000..780b65b1 --- /dev/null +++ b/test/samples/Xors/xors_compare.py @@ -0,0 +1,737 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, 
np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: 
np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: 
def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False):
    """Generate input and golden for scalar/unary float ops (adds, muls, exp, ...).

    ``scalar_left`` only affects "divs": golden = scalar / src instead of
    src / scalar.  The single input buffer and the golden output are written
    to disk as .bin files via _write_buffers/_write_golden.
    """
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Draw style per op: domain-limited ops (log/sqrt/rsqrt/recip) need
    # strictly positive inputs; exp uses a bounded range to avoid overflow.
    style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed"
    if op == "exp":
        style = "exp"
    if op == "cmps":
        style = "cmp"
    if op in {"divs", "rems"}:
        style = "signed"
    # NOTE(review): "divs2" is special-cased here but has no branch in the
    # dispatch chain below, so op="divs2" would raise after the buffers are
    # written -- confirm what the Divs2 sample script actually passes.
    src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed")
    # NOTE(review): the two re-draws below discard the first draw and advance
    # the RNG; for "divs"/"rems" the style is unchanged so they look
    # redundant.  Kept as-is because the generated data depends on RNG order.
    if op in {"divs", "rems"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    if op in {"log", "sqrt", "rsqrt", "recip"}:
        src = _float_values(rng, meta.elem_counts[src_name], style="positive")
    buffers = _default_buffers(meta)
    buffers[src_name] = src
    _write_buffers(meta, buffers)

    if op == "adds":
        out = src + np.float32(scalar)
    elif op == "subs":
        out = src - np.float32(scalar)
    elif op == "muls":
        out = src * np.float32(scalar)
    elif op == "divs":
        out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar)
    elif op == "maxs":
        out = np.maximum(src, np.float32(scalar))
    elif op == "mins":
        out = np.minimum(src, np.float32(scalar))
    elif op == "rems":
        out = np.fmod(src, np.float32(scalar))
    elif op == "lrelu":
        out = np.where(src > 0.0, src, src * np.float32(scalar))
    elif op == "exp":
        out = np.exp(src)
    elif op == "log":
        out = np.log(src)
    elif op == "sqrt":
        out = np.sqrt(src)
    elif op == "rsqrt":
        out = 1.0 / np.sqrt(src)
    elif op == "recip":
        out = 1.0 / src
    else:
        raise ValueError(f"unsupported scalar/unary float op: {op}")
    _write_golden(meta, {_single_output(meta): out.astype(np.float32)})
def generate_prelu_case():
    """Generate inputs and golden for PReLU: out = src where src > 0, else src * slope."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    src_name, slope_name = meta.inputs
    src = _float_values(rng, meta.elem_counts[src_name], style="signed")
    slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small")
    payload = _default_buffers(meta)
    payload[src_name] = src
    payload[slope_name] = slope
    _write_buffers(meta, payload)
    negative_branch = src * slope
    golden = np.where(src > 0.0, src, negative_branch)
    _write_golden(meta, {_single_output(meta): golden.astype(np.float32)})
def generate_row_reduce_case(op: str):
    """Generate input and golden for a row-wise reduction (rowsum/rowmax/rowmin)."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    matrix = _as_matrix(src)
    payload = _default_buffers(meta)
    payload[src_name] = src
    _write_buffers(meta, payload)
    reducers = {
        # rowsum accumulates in float32 to match the original's sum dtype.
        "rowsum": lambda m: m.sum(axis=1, dtype=np.float32),
        "rowmax": lambda m: m.max(axis=1),
        "rowmin": lambda m: m.min(axis=1),
    }
    if op not in reducers:
        raise ValueError(f"unsupported row reduction op: {op}")
    reduced = reducers[op](matrix)
    _write_golden(meta, {_single_output(meta): reduced.astype(np.float32)})
def generate_colexpand_case():
    """Generate input and golden for colexpand: broadcast row 0 down all ROWS rows."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}")
    src_name = meta.inputs[0]
    src = _float_values(_rng(), meta.elem_counts[src_name], style="signed")
    first_row = _as_matrix(src)[0]
    payload = _default_buffers(meta)
    payload[src_name] = src
    _write_buffers(meta, payload)
    expanded = np.tile(first_row, (ROWS, 1))
    _write_golden(meta, {_single_output(meta): expanded.astype(np.float32).reshape(-1)})
style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = 
def generate_sels_case(select_mode: int):
    """Generate inputs and golden for sels: the whole first source is chosen
    when select_mode == 1, otherwise the whole second source."""
    meta = load_case_meta()
    if len(meta.inputs) != 2:
        raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}")
    rng = _rng()
    first_name, second_name = meta.inputs
    first = _float_values(rng, meta.elem_counts[first_name], style="signed")
    second = _float_values(rng, meta.elem_counts[second_name], style="signed")
    payload = _default_buffers(meta)
    payload[first_name] = first
    payload[second_name] = second
    _write_buffers(meta, payload)
    if int(select_mode) == 1:
        chosen = first
    else:
        chosen = second
    _write_golden(meta, {_single_output(meta): chosen.astype(np.float32)})
def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype):
    """Generate input and golden for an integer bitwise/shift op against a scalar."""
    meta = load_case_meta()
    if len(meta.inputs) != 1:
        raise ValueError(f"{op}: expected 1 input, got {meta.inputs}")
    rng = _rng()
    src_name = meta.inputs[0]
    # Shift counts are drawn small and non-negative so the shift stays defined.
    value_style = "shift_small" if op in {"shls", "shrs"} else "bitwise"
    src = _int_values(rng, meta.elem_counts[src_name], dtype, style=value_style)
    payload = _default_buffers(meta)
    payload[src_name] = src
    _write_buffers(meta, payload)
    # Convert the scalar through the target dtype so wrap/truncation matches
    # what the device sees.
    native = np.asarray(scalar, dtype=dtype).item()
    table = {
        "ands": np.bitwise_and,
        "ors": np.bitwise_or,
        "xors": np.bitwise_xor,
        "shls": np.left_shift,
        "shrs": np.right_shift,
    }
    if op not in table:
        raise ValueError(f"unsupported scalar bitwise op: {op}")
    out = table[op](src, native)
    _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)})
def compare_packed_pred_mask(golden_path, output_path, rows, cols):
    """Compare two packed-predicate-mask binaries byte-for-byte.

    Only the leading ``row_bytes`` bytes of each ``cols``-byte storage row
    carry mask bits; trailing padding is ignored.  Returns True on match,
    otherwise prints a diagnostic and returns False.
    """
    for label, path in (("Output", output_path), ("Golden", golden_path)):
        if not os.path.exists(path):
            print(f"[ERROR] {label} missing: {path}")
            return False
    golden = np.fromfile(golden_path, dtype=np.uint8)
    output = np.fromfile(output_path, dtype=np.uint8)
    need = int(rows) * int(cols)
    if min(golden.size, output.size) < need:
        print(
            f"[ERROR] Packed mask buffer too small: need={need} bytes, "
            f"golden={golden.size}, out={output.size}"
        )
        return False
    # Significant bytes per row: 8 per 64-bit mask word (inlined
    # _packed_row_bytes), capped at the storage stride.
    row_bytes = min(((cols + 63) // 64) * 8, cols)
    golden_sel = golden[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    output_sel = output[:need].reshape(rows, cols)[:, :row_bytes].reshape(-1)
    mismatches = np.nonzero(golden_sel != output_sel)[0]
    if mismatches.size:
        idx = int(mismatches[0])
        print(
            f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} "
            f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})"
        )
        return False
    return True
np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def _host_type_to_np(host_type: str) -> np.dtype: + host_type = host_type.strip() + if host_type not in _HOST_TYPE_TO_NP: + raise KeyError(f"unsupported host type: {host_type}") + return np.dtype(_HOST_TYPE_TO_NP[host_type]) + + +def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: + text = Path(main_cpp).read_text(encoding="utf-8") + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) + } + np_types = { + match.group(1): _host_type_to_np(match.group(2)) + for match in re.finditer( + r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + if Path(outputs_txt).is_file(): + outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] + else: + outputs = [] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def _rng(): + return np.random.default_rng(SEED) + + +def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(arr).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f"expected {expected} elements, got {flat.size}") + return flat.reshape(rows, cols) + + +def _float_values(rng, count: int, *, style: str) -> np.ndarray: + if style == "signed": + arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == "signed_small": + arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == "nonzero_signed": + arr = rng.uniform(-3.0, 3.0, 
size=count).astype(np.float32) + mask = np.abs(arr) < np.float32(0.25) + arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == "positive": + arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style == "exp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + elif style == "cmp": + arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f"unsupported float style: {style}") + return arr + + +def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int16 style: {style}") + elif dtype == np.dtype(np.int32): + if style == "bitwise": + vals = rng.integers(-256, 256, size=count, dtype=np.int32) + elif style == "shift_small": + vals = rng.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f"unsupported int32 style: {style}") + else: + raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") + return vals.astype(dtype, copy=False) + + +def _packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError("mask bits must be a 2D array") + rows, cols = bits.shape + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + out = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_idx in range(width): + if bits[row, base_col + bit_idx]: + word |= 1 << bit_idx + out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), 
dtype=np.uint8) + return out.reshape(-1) + + +def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + buf = np.asarray(buf, dtype=np.uint8).reshape(-1) + if rows <= 0 or cols <= 0: + raise ValueError("rows/cols must be positive") + if buf.size % rows != 0: + raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") + storage_cols = buf.size // rows + row_bytes = _packed_row_bytes(cols) + if storage_cols < row_bytes: + raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") + packed = buf.reshape(rows, storage_cols) + bits = np.zeros((rows, cols), dtype=np.bool_) + for row in range(rows): + for word_idx, base_col in enumerate(range(0, cols, 64)): + word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") + width = min(64, cols - base_col) + for bit_idx in range(width): + bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 + return bits + + +def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: + return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) + + +def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: + return {name: _zero_buffer(meta, name) for name in meta.read_order} + + +def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f"missing buffer for {name}") + arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") + arr.tofile(f"{name}.bin") + + +def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f"missing golden for {name}") + arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if arr.size != expected: + raise ValueError(f"{name}: 
expected {expected} golden elements, got {arr.size}") + arr.tofile(f"golden_{name}.bin") + + +def _single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f"expected exactly one output, got {meta.outputs}") + return meta.outputs[0] + + +def generate_binary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + lhs_name, rhs_name = meta.inputs + lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") + rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" + rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) + buffers = _default_buffers(meta) + buffers[lhs_name] = lhs + buffers[rhs_name] = rhs + _write_buffers(meta, buffers) + + if op == "add": + out = lhs + rhs + elif op == "sub": + out = lhs - rhs + elif op == "mul": + out = lhs * rhs + elif op == "div": + out = lhs / rhs + elif op == "max": + out = np.maximum(lhs, rhs) + elif op == "min": + out = np.minimum(lhs, rhs) + elif op == "rem": + out = np.fmod(lhs, rhs) + else: + raise ValueError(f"unsupported binary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" + if op == "exp": + style = "exp" + if op == "cmps": + style = "cmp" + if op in {"divs", "rems"}: + style = "signed" + src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") + if op in {"divs", "rems"}: + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + if op in {"log", "sqrt", "rsqrt", "recip"}: + src = _float_values(rng, meta.elem_counts[src_name], 
style="positive") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "adds": + out = src + np.float32(scalar) + elif op == "subs": + out = src - np.float32(scalar) + elif op == "muls": + out = src * np.float32(scalar) + elif op == "divs": + out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) + elif op == "maxs": + out = np.maximum(src, np.float32(scalar)) + elif op == "mins": + out = np.minimum(src, np.float32(scalar)) + elif op == "rems": + out = np.fmod(src, np.float32(scalar)) + elif op == "lrelu": + out = np.where(src > 0.0, src, src * np.float32(scalar)) + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + else: + raise ValueError(f"unsupported scalar/unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_unary_float_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "signed" + if op in {"exp"}: + style = "exp" + elif op in {"log", "sqrt", "rsqrt", "recip"}: + style = "positive" + src = _float_values(rng, meta.elem_counts[src_name], style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + + if op == "abs": + out = np.abs(src) + elif op == "neg": + out = -src + elif op == "exp": + out = np.exp(src) + elif op == "log": + out = np.log(src) + elif op == "sqrt": + out = np.sqrt(src) + elif op == "rsqrt": + out = 1.0 / np.sqrt(src) + elif op == "recip": + out = 1.0 / src + elif op == "relu": + out = np.maximum(src, np.float32(0.0)) + else: + raise ValueError(f"unsupported unary float op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_prelu_case(): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src_name, slope_name = meta.inputs + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[src_name] = src + buffers[slope_name] = slope + _write_buffers(meta, buffers) + out = np.where(src > 0.0, src, src * slope) + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_addc_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") + rng = _rng() + a_name, b_name, c_name = meta.inputs + a = _float_values(rng, meta.elem_counts[a_name], style="signed") + b = _float_values(rng, meta.elem_counts[b_name], style="signed") + c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") + buffers = _default_buffers(meta) + buffers[a_name] = a + buffers[b_name] = b + buffers[c_name] = c + _write_buffers(meta, buffers) + if op == "addc": + out = a + b + c + elif op == "subc": + out = a - b + c + else: + raise ValueError(f"unsupported carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_scalar_carry_case(op: str, scalar: float): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "addsc": + out = src + np.float32(scalar) + src + elif op == "subsc": + out = src - np.float32(scalar) + src + else: + raise ValueError(f"unsupported scalar carry op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def 
generate_row_reduce_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "rowsum": + out = src_m.sum(axis=1, dtype=np.float32) + elif op == "rowmax": + out = src_m.max(axis=1) + elif op == "rowmin": + out = src_m.min(axis=1) + else: + raise ValueError(f"unsupported row reduction op: {op}") + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_col_reduce_case(op: str, *, accumulate: bool = False): + meta = load_case_meta() + if op == "colsum": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") + src_name, tmp_name = meta.inputs + else: + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src_name = meta.inputs[0] + tmp_name = None + rng = _rng() + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + if tmp_name is not None: + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + out_name = _single_output(meta) + out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) + if accumulate: + out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") + buffers[out_name] = out_init + _write_buffers(meta, buffers) + if op == "colsum": + out = src_m.sum(axis=0, dtype=np.float32) + if accumulate: + out = out + out_init + elif op == "colmax": + out = src_m.max(axis=0) + elif op == "colmin": + out = src_m.min(axis=0) + else: + raise ValueError(f"unsupported col reduction op: {op}") + _write_golden(meta, {out_name: out.astype(np.float32)}) + + +def 
generate_rowexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], COLS, axis=1) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_colexpand_case(): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + src = _float_values(rng, meta.elem_counts[src_name], style="signed") + src_m = _as_matrix(src) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], ROWS, axis=0) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_rowexpand_bin_case(op: str): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") + src0_m = _as_matrix(src0) + src1_m = _as_matrix(src1) + row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + if op == "rowexpandmul": + out = src0_m * row_scalars[:, None] + elif op == "rowexpanddiv": + out = src0_m / row_scalars[:, None] + elif op == "rowexpandsub": + out = src0_m - row_scalars[:, None] + else: + raise ValueError(f"unsupported rowexpand binary op: {op}") + _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) + + +def generate_expands_case(scalar: float): + meta = load_case_meta() + buffers = _default_buffers(meta) + _write_buffers(meta, buffers) + out_name = _single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) + _write_golden(meta, {out_name: out}) + + +def generate_cmp_case(op: str, *, scalar: float = 0.0): + meta = load_case_meta() + rng = _rng() + if op == "cmp": + if len(meta.inputs) != 2: + raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") + pred = _as_matrix(src0) < _as_matrix(src1) + elif op == "cmps": + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + src0_name = meta.inputs[0] + src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") + src1_name = None + src1 = None + pred = _as_matrix(src0) > np.float32(scalar) + else: + raise ValueError(f"unsupported compare op: {op}") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + if src1 is not None and src1_name is not None: + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out_name = _single_output(meta) + if meta.elem_counts[out_name] % ROWS != 0: + raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + storage_cols = meta.elem_counts[out_name] // ROWS + packed = pack_predicate_mask(pred, storage_cols=storage_cols) + _write_golden(meta, {out_name: packed}) + + +def generate_sel_case(): + meta = load_case_meta() + if len(meta.inputs) != 3: + raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") + rng = _rng() + mask_name, src0_name, src1_name = meta.inputs + storage_cols = meta.elem_counts[mask_name] // ROWS + mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask = 
pack_predicate_mask(mask_bits, storage_cols=storage_cols) + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[mask_name] = mask + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) + _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) + + +def generate_sels_case(select_mode: int): + meta = load_case_meta() + if len(meta.inputs) != 2: + raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") + rng = _rng() + src0_name, src1_name = meta.inputs + src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") + src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") + buffers = _default_buffers(meta) + buffers[src0_name] = src0 + buffers[src1_name] = src1 + _write_buffers(meta, buffers) + out = src0 if int(select_mode) == 1 else src1 + _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) + + +def generate_bitwise_self_case(op: str, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shl", "shr"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + if op == "and": + out = np.bitwise_and(src, src) + elif op == "or": + out = np.bitwise_or(src, src) + elif op == "xor": + out = np.bitwise_xor(src, src) + elif op == "shl": + out = np.left_shift(src, src) + elif op == "shr": + out = np.right_shift(src, src) + elif op == "not": + out = np.bitwise_not(src) + else: + raise ValueError(f"unsupported bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, 
dtype=dtype)}) + + +def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): + meta = load_case_meta() + if len(meta.inputs) != 1: + raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") + rng = _rng() + src_name = meta.inputs[0] + style = "shift_small" if op in {"shls", "shrs"} else "bitwise" + src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) + buffers = _default_buffers(meta) + buffers[src_name] = src + _write_buffers(meta, buffers) + scalar = np.asarray(scalar, dtype=dtype).item() + if op == "ands": + out = np.bitwise_and(src, scalar) + elif op == "ors": + out = np.bitwise_or(src, scalar) + elif op == "xors": + out = np.bitwise_xor(src, scalar) + elif op == "shls": + out = np.left_shift(src, scalar) + elif op == "shrs": + out = np.right_shift(src, scalar) + else: + raise ValueError(f"unsupported scalar bitwise op: {op}") + _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) 
+ output_cmp = output.astype(np.float64, copy=False) + abs_diff = np.abs(golden_cmp - output_cmp) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + need = int(rows) * int(cols) + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + row_bytes = min(_packed_row_bytes(cols), cols) + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def compare_all_outputs(dtype, eps): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok + return finalize_compare(ok) + + +def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): + meta = 
load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok + return finalize_compare(ok) + + +def finalize_compare(ok: bool): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return False + print("[INFO] compare passed") + return True + +if __name__ == "__main__": + generate_bitwise_scalar_case("xors", 88, np.int16) From 5b4d0540d9dc95cb6dc85312ce40bcf456f17578 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 12 Mar 2026 17:35:02 +0800 Subject: [PATCH 2/8] Restore remote validation defaults --- .github/workflows/ci.yml | 4 ++-- .../scripts/run_remote_npu_validation.sh | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f25e4156..5e527a13 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ on: skip_cases: description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)" type: string - default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic" + default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print" run_only_cases: description: "Comma/space separated testcase names to run (empty = run all)" type: string @@ -247,7 +247,7 @@ jobs: # Temporary CI gate: skip cases that still error/flap on the remote NPU. # Update this list as we fix the underlying issues. 
DEFAULT_SKIP_CASES: >- - mix_kernel,vadd_validshape,vadd_validshape_dynamic + mix_kernel,vadd_validshape,vadd_validshape_dynamic,print steps: - name: Resolve validation parameters shell: bash diff --git a/test/npu_validation/scripts/run_remote_npu_validation.sh b/test/npu_validation/scripts/run_remote_npu_validation.sh index 8393452f..20000114 100644 --- a/test/npu_validation/scripts/run_remote_npu_validation.sh +++ b/test/npu_validation/scripts/run_remote_npu_validation.sh @@ -74,7 +74,14 @@ for f in "$HOME/.bash_profile" "$HOME/.bashrc"; do source_rc "$f" done -if [[ -f "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" ]]; then +if [[ -f "/usr/local/Ascend/cann/set_env.sh" ]]; then + log "Sourcing /usr/local/Ascend/cann/set_env.sh" + set +e +u +o pipefail + # shellcheck disable=SC1091 + source "/usr/local/Ascend/cann/set_env.sh" || true + set -euo pipefail + set -o pipefail +elif [[ -f "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" ]]; then log "Sourcing /usr/local/Ascend/ascend-toolkit/latest/set_env.sh" set +e +u +o pipefail # shellcheck disable=SC1091 @@ -94,7 +101,7 @@ command -v bisheng || true bisheng --version || true if [[ -z "${ASCEND_HOME_PATH:-}" ]]; then - for d in /usr/local/Ascend/ascend-toolkit/latest /usr/local/Ascend/cann-*; do + for d in /usr/local/Ascend/cann /usr/local/Ascend/cann-* /usr/local/Ascend/ascend-toolkit/latest; do [[ -d "$d" ]] || continue export ASCEND_HOME_PATH="$d" break From 4cd76828e21b01db3d47c1cebec6f37087410d70 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 12 Mar 2026 18:52:51 +0800 Subject: [PATCH 3/8] Rewrite sample validation scripts by hand --- .github/workflows/ci.yml | 2 +- .../scripts/generate_testcase.py | 3 + test/samples/Abs/abs_compare.py | 741 +---------------- test/samples/Abs/abs_golden.py | 744 +---------------- test/samples/Addc/addc_compare.py | 741 +---------------- test/samples/Addc/addc_golden.py | 742 +---------------- test/samples/Adds/adds_compare.py | 741 
+---------------- test/samples/Adds/adds_golden.py | 744 +---------------- test/samples/Addsc/addsc_compare.py | 741 +---------------- test/samples/Addsc/addsc_golden.py | 744 +---------------- test/samples/And/and_compare.py | 741 +---------------- test/samples/And/and_golden.py | 744 +---------------- test/samples/Ands/ands_compare.py | 741 +---------------- test/samples/Ands/ands_golden.py | 744 +---------------- test/samples/Cmp/cmp_compare.py | 740 +---------------- test/samples/Cmp/cmp_golden.py | 741 +---------------- test/samples/Cmps/cmps_compare.py | 740 +---------------- test/samples/Cmps/cmps_golden.py | 745 +---------------- test/samples/Colexpand/colexpand_compare.py | 741 +---------------- test/samples/Colexpand/colexpand_golden.py | 745 +---------------- test/samples/Colmax/colmax_compare.py | 741 +---------------- test/samples/Colmax/colmax_golden.py | 753 +---------------- test/samples/Colmin/colmin_compare.py | 741 +---------------- test/samples/Colmin/colmin_golden.py | 753 +---------------- test/samples/Colsum/colsum_compare.py | 741 +---------------- test/samples/Colsum/colsum_golden.py | 756 +----------------- test/samples/Div/div_compare.py | 741 +---------------- test/samples/Div/div_golden.py | 742 +---------------- test/samples/Divs/divs_compare.py | 741 +---------------- test/samples/Divs/divs_golden.py | 744 +---------------- test/samples/Divs2/divs2_compare.py | 741 +---------------- test/samples/Divs2/divs2_golden.py | 744 +---------------- test/samples/Exp/exp_compare.py | 741 +---------------- test/samples/Exp/exp_golden.py | 744 +---------------- test/samples/Expands/expand_compare.py | 741 +---------------- test/samples/Expands/expand_golden.py | 742 +---------------- test/samples/Expands/expands_compare.py | 741 +---------------- test/samples/Expands/expands_golden.py | 742 +---------------- test/samples/Log/log_compare.py | 741 +---------------- test/samples/Log/log_golden.py | 744 +---------------- 
test/samples/Lrelu/lrelu_compare.py | 741 +---------------- test/samples/Lrelu/lrelu_golden.py | 744 +---------------- test/samples/Max/max_compare.py | 741 +---------------- test/samples/Max/max_golden.py | 742 +---------------- test/samples/Maxs/maxs_compare.py | 741 +---------------- test/samples/Maxs/maxs_golden.py | 744 +---------------- test/samples/Min/min_compare.py | 741 +---------------- test/samples/Min/min_golden.py | 742 +---------------- test/samples/Mins/mins_compare.py | 741 +---------------- test/samples/Mins/mins_golden.py | 744 +---------------- test/samples/Mul/mul_compare.py | 741 +---------------- test/samples/Mul/mul_golden.py | 742 +---------------- test/samples/Muls/muls_compare.py | 741 +---------------- test/samples/Muls/muls_golden.py | 744 +---------------- test/samples/Neg/neg_compare.py | 741 +---------------- test/samples/Neg/neg_golden.py | 744 +---------------- test/samples/Not/not_compare.py | 741 +---------------- test/samples/Not/not_golden.py | 744 +---------------- test/samples/Or/or_compare.py | 741 +---------------- test/samples/Or/or_golden.py | 744 +---------------- test/samples/Ors/ors_compare.py | 741 +---------------- test/samples/Ors/ors_golden.py | 744 +---------------- test/samples/Partadd/partadd_compare.py | 741 +---------------- test/samples/Partadd/partadd_golden.py | 742 +---------------- test/samples/Partmax/partmax_compare.py | 741 +---------------- test/samples/Partmax/partmax_golden.py | 742 +---------------- test/samples/Partmin/partmin_compare.py | 741 +---------------- test/samples/Partmin/partmin_golden.py | 742 +---------------- test/samples/Prelu/prelu_compare.py | 741 +---------------- test/samples/Prelu/prelu_golden.py | 740 +---------------- test/samples/Recip/recip_compare.py | 741 +---------------- test/samples/Recip/recip_golden.py | 744 +---------------- test/samples/Relu/relu_compare.py | 741 +---------------- test/samples/Relu/relu_golden.py | 744 +---------------- 
test/samples/Rem/rem_compare.py | 741 +---------------- test/samples/Rem/rem_golden.py | 742 +---------------- test/samples/Rems/rems_compare.py | 741 +---------------- test/samples/Rems/rems_golden.py | 744 +---------------- test/samples/Rowexpand/rowexpand_compare.py | 741 +---------------- test/samples/Rowexpand/rowexpand_golden.py | 745 +---------------- .../Rowexpanddiv/rowexpanddiv_compare.py | 741 +---------------- .../Rowexpanddiv/rowexpanddiv_golden.py | 743 +---------------- .../Rowexpandmul/rowexpandmul_compare.py | 741 +---------------- .../Rowexpandmul/rowexpandmul_golden.py | 743 +---------------- .../Rowexpandsub/rowexpandsub_compare.py | 741 +---------------- .../Rowexpandsub/rowexpandsub_golden.py | 743 +---------------- test/samples/Rowmax/rowmax_compare.py | 741 +---------------- test/samples/Rowmax/rowmax_golden.py | 753 +---------------- test/samples/Rowmin/rowmin_compare.py | 741 +---------------- test/samples/Rowmin/rowmin_golden.py | 753 +---------------- test/samples/Rowsum/rowsum_compare.py | 741 +---------------- test/samples/Rowsum/rowsum_golden.py | 753 +---------------- test/samples/Rsqrt/rsqrt_compare.py | 741 +---------------- test/samples/Rsqrt/rsqrt_golden.py | 744 +---------------- test/samples/Sel/sel_compare.py | 741 +---------------- test/samples/Sel/sel_golden.py | 740 +---------------- test/samples/Sels/sels_compare.py | 741 +---------------- test/samples/Sels/sels_golden.py | 742 +---------------- test/samples/Shl/shl_compare.py | 741 +---------------- test/samples/Shl/shl_golden.py | 744 +---------------- test/samples/Shls/shls_compare.py | 741 +---------------- test/samples/Shls/shls_golden.py | 744 +---------------- test/samples/Shr/shr_compare.py | 741 +---------------- test/samples/Shr/shr_golden.py | 744 +---------------- test/samples/Shrs/shrs_compare.py | 741 +---------------- test/samples/Shrs/shrs_golden.py | 744 +---------------- test/samples/Sqrt/sqrt_compare.py | 741 +---------------- 
test/samples/Sqrt/sqrt_golden.py | 744 +---------------- test/samples/Sub/sub_compare.py | 741 +---------------- test/samples/Sub/sub_golden.py | 742 +---------------- test/samples/Subc/subc_compare.py | 741 +---------------- test/samples/Subc/subc_golden.py | 742 +---------------- test/samples/Subs/subs_compare.py | 741 +---------------- test/samples/Subs/subs_golden.py | 744 +---------------- test/samples/Subsc/subsc_compare.py | 741 +---------------- test/samples/Subsc/subsc_golden.py | 744 +---------------- .../VectorAddition/vadd_pto_ir_compare.py | 741 +---------------- .../VectorAddition/vadd_pto_ir_golden.py | 742 +---------------- .../VectorAddition/vectorAddition_compare.py | 741 +---------------- .../VectorAddition/vectorAddition_golden.py | 742 +---------------- test/samples/Xor/xor_compare.py | 741 +---------------- test/samples/Xor/xor_golden.py | 744 +---------------- test/samples/Xors/xors_compare.py | 741 +---------------- test/samples/Xors/xors_golden.py | 744 +---------------- test/samples/validation_runtime.py | 273 +++++++ 125 files changed, 1931 insertions(+), 88944 deletions(-) create mode 100644 test/samples/validation_runtime.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5e527a13..b5074c19 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -210,7 +210,7 @@ jobs: cp "${file}" "${dst}" done < <( find test/samples \ - \( -path '*/npu_validation/*' -o -name '*_golden.py' -o -name '*_compare.py' \) \ + \( -path '*/npu_validation/*' -o -name '*_golden.py' -o -name '*_compare.py' -o -name 'validation_runtime.py' \) \ -type f -print0 ) chmod +x "${PAYLOAD_DIR}/test/npu_validation/scripts/run_remote_npu_validation.sh" diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index 481f7b81..619582f7 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -851,6 +851,7 @@ def 
generate_testcase( custom_golden = _find_custom_case_asset(sample_root, testcase, "golden.py") custom_compare = _find_custom_case_asset(sample_root, testcase, "compare.py") + shared_validation_runtime = sample_root.parent / "validation_runtime.py" raw_kernel = input_cpp.read_text(encoding="utf-8") raw_kernel_for_analysis = raw_kernel @@ -1174,6 +1175,8 @@ def generate_testcase( else: golden_py = golden_template.replace("@INPUT_GENERATE@", "\n".join(input_generate)) golden_dst.write_text(golden_py, encoding="utf-8") + if (custom_golden is not None or custom_compare is not None) and shared_validation_runtime.is_file(): + _copy_asset_if_needed(shared_validation_runtime, output_dir / "validation_runtime.py") # Emit the kernel source, optionally injecting a packed-predicate preload to # make TCMP/TCMPS outputs deterministic for byte-wise compares. diff --git a/test/samples/Abs/abs_compare.py b/test/samples/Abs/abs_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Abs/abs_compare.py +++ b/test/samples/Abs/abs_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] 
- np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, 
np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = 
COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: 
CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op 
== "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got 
{meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, 
got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got 
{meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = 
_default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = 
int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Abs/abs_golden.py b/test/samples/Abs/abs_golden.py index b26aa123..d009ea94 100755 --- a/test/samples/Abs/abs_golden.py +++ b/test/samples/Abs/abs_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise 
KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise 
ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = 
_packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - 
rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, 
np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope 
- _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, 
dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() 
- if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) 
!= 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = 
float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return 
finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.abs(src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_unary_float_case("abs") +if __name__ == '__main__': + main() diff --git a/test/samples/Addc/addc_compare.py b/test/samples/Addc/addc_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Addc/addc_compare.py +++ b/test/samples/Addc/addc_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - 
raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise 
ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = 
_packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - 
rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, 
np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope 
- _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, 
dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() 
- if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - 
buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, 
dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare 
passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Addc/addc_golden.py b/test/samples/Addc/addc_golden.py index a9dc8190..e897dc7a 100755 --- a/test/samples/Addc/addc_golden.py +++ b/test/samples/Addc/addc_golden.py @@ -1,737 +1,31 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for 
match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise 
ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - 
word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] 
= lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - 
out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_addc_case(op: str): +def main(): meta = 
load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) + generator = rng() + a = float_values(generator, meta.elem_counts[a_name], style='signed') + b = float_values(generator, meta.elem_counts[b_name], style='signed') + c = float_values(generator, meta.elem_counts[c_name], style='signed_small') + buffers = default_buffers(meta) buffers[a_name] = a buffers[b_name] = b buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def 
generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = 
_int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" 
- ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = a + b + c + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - 
strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_addc_case("addc") +if __name__ == '__main__': + main() diff --git a/test/samples/Adds/adds_compare.py b/test/samples/Adds/adds_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Adds/adds_compare.py +++ b/test/samples/Adds/adds_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> 
CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): 
- if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), 
dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = 
_float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * 
np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = 
load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: 
expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise 
ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = 
np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: 
{output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, 
atol=0.0001) diff --git a/test/samples/Adds/adds_golden.py b/test/samples/Adds/adds_golden.py index 1937f3b2..e98a225f 100755 --- a/test/samples/Adds/adds_golden.py +++ b/test/samples/Adds/adds_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - 
r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == 
"shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = 
((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif 
op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = 
_float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = 
meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], 
ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = 
_float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - 
elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not 
os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src + np.float32(3.14) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("adds", 3.14) +if 
__name__ == '__main__': + main() diff --git a/test/samples/Addsc/addsc_compare.py b/test/samples/Addsc/addsc_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Addsc/addsc_compare.py +++ b/test/samples/Addsc/addsc_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( 
- r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == 
"shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = 
((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif 
op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = 
_float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = 
meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], 
ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - 
buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): 
np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - 
print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Addsc/addsc_golden.py b/test/samples/Addsc/addsc_golden.py index 1ca45a9e..b2fb27d4 100755 --- a/test/samples/Addsc/addsc_golden.py +++ b/test/samples/Addsc/addsc_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses 
import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return 
CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - 
-def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def 
_write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = 
"exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name 
is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def 
generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) 
- - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask 
buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src + np.float32(3.14) + src + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_carry_case("addsc", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/And/and_compare.py b/test/samples/And/and_compare.py index 780b65b1..6173882b 100755 --- a/test/samples/And/and_compare.py +++ b/test/samples/And/and_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re 
-import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: 
int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = 
bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng 
= _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = 
_as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if 
diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int16, 0.0) +if __name__ == '__main__': + compare_outputs(np.int16, atol=0.0) diff --git a/test/samples/And/and_golden.py b/test/samples/And/and_golden.py index 93c5af66..5306267a 100755 --- a/test/samples/And/and_golden.py +++ b/test/samples/And/and_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + 
sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * 
cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - 
raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - 
raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} 
else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = 
-src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = 
_float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], 
style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = 
_default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, 
storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int16, style='bitwise') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - 
golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - 
output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.bitwise_and(src, src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int16)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_bitwise_self_case("and", np.int16) +if __name__ == '__main__': + main() diff --git a/test/samples/Ands/ands_compare.py b/test/samples/Ands/ands_compare.py index 780b65b1..6173882b 100755 --- a/test/samples/Ands/ands_compare.py +++ b/test/samples/Ands/ands_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": 
np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if 
style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 
64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name 
not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" 
else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - 
elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif 
op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = 
src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / 
row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = 
_rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = 
np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or 
np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in 
meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int16, 0.0) +if __name__ == '__main__': + compare_outputs(np.int16, atol=0.0) diff --git a/test/samples/Ands/ands_golden.py b/test/samples/Ands/ands_golden.py index 4905e5b2..0cc1060d 100755 --- a/test/samples/Ands/ands_golden.py +++ b/test/samples/Ands/ands_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - 
"size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, 
size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - 
out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - 
expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in 
{"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: 
{op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported 
rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = 
rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int16, style='bitwise') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - 
elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if 
np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, 
out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.bitwise_and(src, np.asarray(88, dtype=np.int16).item()) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int16)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_bitwise_scalar_case("ands", 88, np.int16) +if __name__ == '__main__': + main() diff --git a/test/samples/Cmp/cmp_compare.py b/test/samples/Cmp/cmp_compare.py index 00382a69..4c411476 100755 --- a/test/samples/Cmp/cmp_compare.py +++ b/test/samples/Cmp/cmp_compare.py @@ -1,737 +1,13 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} 
- - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) 
< np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def 
unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got 
{arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - 
if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = 
load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = 
load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - 
- -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, 
storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def 
generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = 
output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True 
- for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_packed_mask_outputs -if __name__ == "__main__": - compare_all_packed_mask_outputs() +if __name__ == '__main__': + compare_packed_mask_outputs() diff --git a/test/samples/Cmp/cmp_golden.py b/test/samples/Cmp/cmp_golden.py index 44fe0111..d0c7dd3b 100755 --- a/test/samples/Cmp/cmp_golden.py +++ b/test/samples/Cmp/cmp_golden.py @@ -1,737 +1,32 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] 
- outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == 
"positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, 
dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - 
raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif 
op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = 
meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = 
meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = 
meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) +from validation_runtime import ROWS, default_buffers, float_values, load_case_meta, matrix32, pack_predicate_mask, rng, single_output, write_buffers, write_golden -def generate_rowexpand_bin_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) + generator = rng() + src0 = float_values(generator, meta.elem_counts[src0_name], style='cmp') + src1 = float_values(generator, meta.elem_counts[src1_name], style='cmp') + pred = matrix32(src0) < matrix32(src1) + buffers = default_buffers(meta) buffers[src0_name] = src0 buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == 
"rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") + write_buffers(meta, buffers) + out_name = single_output(meta) storage_cols = meta.elem_counts[out_name] // ROWS packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if 
len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = 
np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - 
output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def 
compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_golden(meta, {out_name: packed}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_cmp_case("cmp") +if __name__ == '__main__': + main() diff --git a/test/samples/Cmps/cmps_compare.py b/test/samples/Cmps/cmps_compare.py index 00382a69..4c411476 100755 --- a/test/samples/Cmps/cmps_compare.py +++ b/test/samples/Cmps/cmps_compare.py @@ -1,737 +1,13 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def 
_host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, 
size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - 
raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: 
str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = 
np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, 
meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - 
buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - 
buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - 
_write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - 
buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} 
else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, 
out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 
'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_packed_mask_outputs -if __name__ == "__main__": - compare_all_packed_mask_outputs() +if __name__ == '__main__': + compare_packed_mask_outputs() diff --git a/test/samples/Cmps/cmps_golden.py b/test/samples/Cmps/cmps_golden.py index 208c4d8c..301f8bcc 100755 --- a/test/samples/Cmps/cmps_golden.py +++ b/test/samples/Cmps/cmps_golden.py @@ -1,737 +1,30 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in 
_HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - 
raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = 
_packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - 
rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, 
np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope 
- _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, 
dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) +from validation_runtime import ROWS, default_buffers, float_values, load_case_meta, matrix32, pack_predicate_mask, rng, single_output, write_buffers, write_golden -def generate_colexpand_case(): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='cmp') + pred = matrix32(src) > np.float32(1.0) + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def 
generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS + write_buffers(meta, buffers) + out_name = single_output(meta) + storage_cols = meta.elem_counts[out_name] // 32 packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 
2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - 
_write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - 
else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def 
compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_golden(meta, {out_name: packed}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_cmp_case("cmps", scalar=1.0) +if __name__ == '__main__': + main() diff --git a/test/samples/Colexpand/colexpand_compare.py b/test/samples/Colexpand/colexpand_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Colexpand/colexpand_compare.py +++ b/test/samples/Colexpand/colexpand_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name 
for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == 
"exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be 
positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - 
- -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * 
np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope 
= _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = 
_as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = 
_as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), 
dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - 
buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" 
if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - 
f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if 
not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Colexpand/colexpand_golden.py b/test/samples/Colexpand/colexpand_golden.py index f065cf74..119d6392 100755 --- a/test/samples/Colexpand/colexpand_golden.py +++ b/test/samples/Colexpand/colexpand_golden.py @@ -1,737 +1,28 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def 
load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: 
str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for 
cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - 
rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, 
np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise 
ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = 
load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, 
src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, 
float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + src_m = matrix32(src) + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name 
= meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.repeat(src_m[:1, :], 
32, axis=0) + write_golden(meta, {single_output(meta): out.astype(np.float32).reshape(-1)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_colexpand_case() +if __name__ == '__main__': + main() diff --git a/test/samples/Colmax/colmax_compare.py b/test/samples/Colmax/colmax_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Colmax/colmax_compare.py +++ b/test/samples/Colmax/colmax_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") 
- return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def 
_int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise 
ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = 
_float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, 
np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, 
src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif 
op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) 
- - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: 
expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - 
-def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = 
np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def 
compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import 
compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Colmax/colmax_golden.py b/test/samples/Colmax/colmax_golden.py index d1631bde..992ae5f1 100755 --- a/test/samples/Colmax/colmax_golden.py +++ b/test/samples/Colmax/colmax_golden.py @@ -1,737 +1,38 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 
style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - 
buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = 
np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, 
b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = 
load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - 
elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) +from validation_runtime import ROWS, COLS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + src_m = 
matrix32(src) + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) + write_buffers(meta, buffers) + reduced = np.asarray(src_m.max(axis=0), dtype=np.float32) + out = np.asarray(buffers.get(out_name, np.zeros(meta.elem_counts[out_name], dtype=np.float32)), dtype=np.float32).reshape(-1).copy() + if out.size == ROWS * COLS: + out_m = matrix32(out) + out_m[0, :] = reduced + out = out_m.reshape(-1) + elif out.size == COLS: + out = reduced else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp 
= output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = 
True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + raise ValueError(f'unsupported col-reduce output size: {out.size}') + write_golden(meta, {out_name: out}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_col_reduce_case("colmax", accumulate=False) +if __name__ == '__main__': + main() diff --git a/test/samples/Colmin/colmin_compare.py b/test/samples/Colmin/colmin_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Colmin/colmin_compare.py +++ b/test/samples/Colmin/colmin_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in 
self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, 
size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - 
raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: 
str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = 
np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, 
meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - 
buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - 
buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - 
_write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - 
buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} 
else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, 
out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - 
print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Colmin/colmin_golden.py b/test/samples/Colmin/colmin_golden.py index 183ce333..d8081173 100755 --- a/test/samples/Colmin/colmin_golden.py +++ b/test/samples/Colmin/colmin_golden.py @@ -1,737 +1,38 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: 
str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if 
dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - 
bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else 
"signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 
0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - 
meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise 
ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) +from validation_runtime import ROWS, COLS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + 
out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + src_m = matrix32(src) + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) + write_buffers(meta, buffers) + reduced = np.asarray(src_m.min(axis=0), dtype=np.float32) + out = np.asarray(buffers.get(out_name, np.zeros(meta.elem_counts[out_name], dtype=np.float32)), dtype=np.float32).reshape(-1).copy() + if out.size == ROWS * COLS: + out_m = matrix32(out) + out_m[0, :] = reduced + out = out_m.reshape(-1) + elif out.size == COLS: + out = reduced else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, 
copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - 
return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + raise ValueError(f'unsupported col-reduce output size: {out.size}') + write_golden(meta, {out_name: out}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_col_reduce_case("colmin", accumulate=False) +if __name__ == '__main__': + main() diff --git a/test/samples/Colsum/colsum_compare.py b/test/samples/Colsum/colsum_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Colsum/colsum_compare.py +++ b/test/samples/Colsum/colsum_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - 
read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) 
- elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = 
np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if 
len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + 
np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - 
src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = 
_rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() 
- src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, 
buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = 
float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return 
finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Colsum/colsum_golden.py b/test/samples/Colsum/colsum_golden.py index f50d62d7..2408b9f8 100755 --- a/test/samples/Colsum/colsum_golden.py +++ b/test/samples/Colsum/colsum_golden.py @@ -1,737 +1,41 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported 
host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: 
{style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < 
row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = 
meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = 
np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = 
np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = 
src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 
2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) +from validation_runtime import ROWS, COLS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else 
"bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + src_name, tmp_name = meta.inputs + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + src_m = matrix32(src) + out_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) + buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) + buffers[out_name] = out_init + write_buffers(meta, buffers) + reduced = np.asarray(src_m.sum(axis=0, dtype=np.float32), dtype=np.float32) + out = np.asarray(out_init, dtype=np.float32).reshape(-1).copy() + if out.size == ROWS * COLS: + out_m = matrix32(out) + out_m[0, :] = reduced + out_m[0, :] + out = out_m.reshape(-1) + elif out.size == COLS: + out = reduced + out else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, 
atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - 
f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + raise ValueError(f'unsupported colsum output size: {out.size}') + write_golden(meta, {out_name: out}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_col_reduce_case("colsum", accumulate=True) +if __name__ == '__main__': + main() diff --git a/test/samples/Div/div_compare.py b/test/samples/Div/div_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Div/div_compare.py +++ b/test/samples/Div/div_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - 
"int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == 
"signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, 
base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], 
dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, 
meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise 
ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise 
ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: 
- raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - 
row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - 
storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = 
np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = 
golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", 
dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Div/div_golden.py b/test/samples/Div/div_golden.py index 1f6f9f3b..a5eafc1a 100755 --- a/test/samples/Div/div_golden.py +++ b/test/samples/Div/div_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property 
- def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 
4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or 
cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one 
output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='nonzero_signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, 
meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise 
ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise 
ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: 
- raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - 
row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - 
storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = 
np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = 
golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", 
dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = lhs / rhs + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("div") +if __name__ == '__main__': + main() diff --git a/test/samples/Divs/divs_compare.py b/test/samples/Divs/divs_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Divs/divs_compare.py +++ b/test/samples/Divs/divs_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - 
read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) 
- elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = 
np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if 
len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + 
np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - 
src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = 
_rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() 
- src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, 
buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = 
float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return 
finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Divs/divs_golden.py b/test/samples/Divs/divs_golden.py index e6b841f1..af2ac1a0 100755 --- a/test/samples/Divs/divs_golden.py +++ b/test/samples/Divs/divs_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: 
{host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - 
return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - 
raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = 
_float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, 
np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, 
src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif 
op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) 
- - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: 
expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from 
validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - 
src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src / np.float32(3.14) + 
write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("divs", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Divs2/divs2_compare.py b/test/samples/Divs2/divs2_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Divs2/divs2_compare.py +++ b/test/samples/Divs2/divs2_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return 
np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def 
_int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise 
ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = 
_float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, 
np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, 
src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif 
op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) 
- - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: 
expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - 
-def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = 
np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def 
compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import 
compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Divs2/divs2_golden.py b/test/samples/Divs2/divs2_golden.py index 13242400..77846158 100755 --- a/test/samples/Divs2/divs2_golden.py +++ b/test/samples/Divs2/divs2_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 
style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - 
buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = 
np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, 
b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = 
load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - 
elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 
inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='nonzero_signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, 
buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def 
compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.float32(3.14) / src + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - 
print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("divs", 3.14, scalar_left=True) +if __name__ == '__main__': + main() diff --git a/test/samples/Exp/exp_compare.py b/test/samples/Exp/exp_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Exp/exp_compare.py +++ b/test/samples/Exp/exp_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): 
int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - 
raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 
64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - 
buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - 
elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - 
rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = 
False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, 
meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = 
_as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = 
_float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = 
np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - 
return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Exp/exp_golden.py b/test/samples/Exp/exp_golden.py index ed2a1dc1..2f0dcbeb 100755 
--- a/test/samples/Exp/exp_golden.py +++ b/test/samples/Exp/exp_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if 
Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise 
ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], 
dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": 
- out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - 
buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None 
- rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = 
load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise 
ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = 
src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='exp') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = 
np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, 
dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.exp(src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_unary_float_case("exp") +if __name__ == '__main__': + main() diff --git a/test/samples/Expands/expand_compare.py b/test/samples/Expands/expand_compare.py index 891703ea..2a923d5f 
100755 --- a/test/samples/Expands/expand_compare.py +++ b/test/samples/Expands/expand_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = 
re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise 
ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: 
str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif 
op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = 
_default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = 
None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = 
src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, 
dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - 
f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Expands/expand_golden.py b/test/samples/Expands/expand_golden.py index b5d7626c..d46d860d 100755 --- a/test/samples/Expands/expand_golden.py +++ b/test/samples/Expands/expand_golden.py @@ -1,737 +1,23 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing 
import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, 
read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, 
storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, 
np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, 
scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: 
- style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = 
np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = 
_single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - 
src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output 
= np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not 
np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) +from validation_runtime import default_buffers, load_case_meta, single_output, write_buffers, write_golden -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): +def main(): meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, default_buffers(meta)) + out_name = single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(3.14), dtype=np.float32) + write_golden(meta, {out_name: out}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_expands_case(3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Expands/expands_compare.py b/test/samples/Expands/expands_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Expands/expands_compare.py +++ b/test/samples/Expands/expands_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for 
search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) 
- - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - 
raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for 
{name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() 
- if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if 
diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Expands/expands_golden.py b/test/samples/Expands/expands_golden.py index b5d7626c..d46d860d 100755 --- a/test/samples/Expands/expands_golden.py +++ b/test/samples/Expands/expands_golden.py @@ -1,737 +1,23 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": 
np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: 
str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in 
enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name 
in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], 
style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == 
"recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + 
np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - 
elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == 
"rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 
inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = 
np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif 
np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = 
load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) +from validation_runtime import default_buffers, load_case_meta, single_output, write_buffers, write_golden -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): +def main(): meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, default_buffers(meta)) + out_name = single_output(meta) + out = np.full(meta.elem_counts[out_name], np.float32(3.14), dtype=np.float32) + write_golden(meta, {out_name: out}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_expands_case(3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Log/log_compare.py b/test/samples/Log/log_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Log/log_compare.py +++ b/test/samples/Log/log_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": 
np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == 
"signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, 
base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], 
dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, 
meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise 
ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise 
ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: 
- raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - 
row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - 
storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = 
np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = 
golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", 
dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Log/log_golden.py b/test/samples/Log/log_golden.py index ae9746a6..8474aeb1 100755 --- a/test/samples/Log/log_golden.py +++ b/test/samples/Log/log_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property 
- def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 
4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or 
cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one 
output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - 
np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = 
_float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = 
_float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = 
_float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = 
_single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], 
style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='positive') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise 
ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - 
output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return 
finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.log(src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_unary_float_case("log") +if __name__ == '__main__': + main() diff --git a/test/samples/Lrelu/lrelu_compare.py b/test/samples/Lrelu/lrelu_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Lrelu/lrelu_compare.py +++ b/test/samples/Lrelu/lrelu_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - 
outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == 
"positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, 
dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - 
raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif 
op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = 
meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = 
meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = 
meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - 
out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = 
float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return 
finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Lrelu/lrelu_golden.py b/test/samples/Lrelu/lrelu_golden.py index 1be0c688..119e8711 100755 --- a/test/samples/Lrelu/lrelu_golden.py +++ b/test/samples/Lrelu/lrelu_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: 
{host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - 
return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - 
raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = 
_float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, 
np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, 
src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif 
op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) 
- - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: 
expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from 
validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - 
src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.where(src > 0.0, src, 
src * np.float32(3.14)) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("lrelu", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Max/max_compare.py b/test/samples/Max/max_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Max/max_compare.py +++ b/test/samples/Max/max_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: 
{host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - 
return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - 
raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = 
_float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, 
np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, 
src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif 
op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) 
- - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: 
expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - 
-def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = 
np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def 
compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import 
compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Max/max_golden.py b/test/samples/Max/max_golden.py index 41cf4f08..e925a6a6 100755 --- a/test/samples/Max/max_golden.py +++ b/test/samples/Max/max_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", 
text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): 
- if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") 
- width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - 
buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, 
np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, 
src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif 
op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) 
- - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: 
expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - 
-def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = 
np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def 
compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.maximum(lhs, rhs) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - 
print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("max") +if __name__ == '__main__': + main() diff --git a/test/samples/Maxs/maxs_compare.py b/test/samples/Maxs/maxs_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Maxs/maxs_compare.py +++ b/test/samples/Maxs/maxs_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for 
match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise 
ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - 
word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] 
= lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - 
out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, 
b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = 
load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - 
elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, 
scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden 
= np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Maxs/maxs_golden.py b/test/samples/Maxs/maxs_golden.py index 240b317f..2a18be2c 100755 --- 
a/test/samples/Maxs/maxs_golden.py +++ b/test/samples/Maxs/maxs_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if 
Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise 
ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], 
dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": 
- out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - 
buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None 
- rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = 
load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise 
ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = 
src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = 
np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, 
dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.maximum(src, np.float32(3.14)) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("maxs", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Min/min_compare.py b/test/samples/Min/min_compare.py index 
891703ea..2a923d5f 100755 --- a/test/samples/Min/min_compare.py +++ b/test/samples/Min/min_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = 
re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise 
ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: 
str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif 
op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = 
_default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = 
None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = 
src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, 
dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - 
f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Min/min_golden.py b/test/samples/Min/min_golden.py index 5da825df..d620b49a 100755 --- a/test/samples/Min/min_golden.py +++ b/test/samples/Min/min_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy 
as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - 
-def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = 
np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if 
name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op 
== "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = 
_default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = 
None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = 
src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, 
dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - 
f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.minimum(lhs, rhs) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("min") +if __name__ == '__main__': + main() diff --git a/test/samples/Mins/mins_compare.py b/test/samples/Mins/mins_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Mins/mins_compare.py +++ b/test/samples/Mins/mins_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from 
pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return 
CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - 
-def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def 
_write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = 
"exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name 
is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = 
np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = 
output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Mins/mins_golden.py b/test/samples/Mins/mins_golden.py index 5d1a7293..e0f85b23 100755 --- a/test/samples/Mins/mins_golden.py +++ b/test/samples/Mins/mins_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS 
= 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise 
ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise 
ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise 
ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else 
"signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - 
elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = 
_float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], 
style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = 
_default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, 
storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = 
np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = 
output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.minimum(src, np.float32(3.14)) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("mins", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Mul/mul_compare.py b/test/samples/Mul/mul_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Mul/mul_compare.py +++ b/test/samples/Mul/mul_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 
'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = 
np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = 
_packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - 
expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - 
style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op 
== "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got 
{meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if 
accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - 
row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = 
meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") 
- return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): 
{golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Mul/mul_golden.py b/test/samples/Mul/mul_golden.py index 5af50d07..d100daa5 100755 --- a/test/samples/Mul/mul_golden.py +++ b/test/samples/Mul/mul_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - 
"int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, 
size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 
- for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden 
for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = 
meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, 
buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - 
if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - 
row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = 
meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") 
- return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): 
{golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = lhs * rhs + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("mul") +if __name__ == '__main__': + main() diff --git a/test/samples/Muls/muls_compare.py b/test/samples/Muls/muls_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Muls/muls_compare.py +++ b/test/samples/Muls/muls_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": 
np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - 
arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, 
cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise 
KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") 
- if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - 
out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out 
= src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif 
op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif 
op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, 
src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, 
src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, 
np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = 
compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Muls/muls_golden.py b/test/samples/Muls/muls_golden.py index bc3212a8..320cd2be 100755 --- a/test/samples/Muls/muls_golden.py +++ b/test/samples/Muls/muls_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - 
read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) 
- elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = 
np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if 
len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + 
np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - 
src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = 
_rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() 
- src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, 
buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) 
- else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, 
copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - 
return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src * np.float32(3.14) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("muls", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Neg/neg_compare.py b/test/samples/Neg/neg_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Neg/neg_compare.py +++ b/test/samples/Neg/neg_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: 
List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style 
== "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, 
dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - 
raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif 
op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = 
meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = 
meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = 
meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - 
out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = 
float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return 
finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Neg/neg_golden.py b/test/samples/Neg/neg_golden.py index d1088c95..90012ad8 100755 --- a/test/samples/Neg/neg_golden.py +++ b/test/samples/Neg/neg_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - 
return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def 
_int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise 
ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = 
_float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, 
np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, 
src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif 
op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) 
- - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: 
expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from 
validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - 
src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = -src + write_golden(meta, 
{single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_unary_float_case("neg") +if __name__ == '__main__': + main() diff --git a/test/samples/Not/not_compare.py b/test/samples/Not/not_compare.py index 780b65b1..6173882b 100755 --- a/test/samples/Not/not_compare.py +++ b/test/samples/Not/not_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 
style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - 
buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = 
np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, 
b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = 
load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - 
elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, 
scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden 
= np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - 
compare_all_outputs(np.int16, 0.0) +if __name__ == '__main__': + compare_outputs(np.int16, atol=0.0) diff --git a/test/samples/Not/not_golden.py b/test/samples/Not/not_golden.py index 805f4e31..fc1a6745 100755 --- a/test/samples/Not/not_golden.py +++ b/test/samples/Not/not_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): 
_host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = 
rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) 
- for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == 
"sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: 
- raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = 
_float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 
non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got 
{meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = 
_float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int16, style='bitwise') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - 
out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not 
os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.bitwise_not(src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int16)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare 
passed") - return True -if __name__ == "__main__": - generate_bitwise_self_case("not", np.int16) +if __name__ == '__main__': + main() diff --git a/test/samples/Or/or_compare.py b/test/samples/Or/or_compare.py index 780b65b1..6173882b 100755 --- a/test/samples/Or/or_compare.py +++ b/test/samples/Or/or_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', 
text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise 
ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], 
dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": 
- out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - 
buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None 
- rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = 
load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise 
ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - 
print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return 
False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int16, 0.0) +if __name__ == '__main__': + compare_outputs(np.int16, atol=0.0) diff --git a/test/samples/Or/or_golden.py b/test/samples/Or/or_golden.py index 257b3900..d5b15199 100755 --- a/test/samples/Or/or_golden.py +++ b/test/samples/Or/or_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# 
coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in 
Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return 
vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> 
Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported 
binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got 
{meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = 
a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: 
expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - 
buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 
- _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int16, style='bitwise') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar 
bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = 
int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.bitwise_or(src, src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int16)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_bitwise_self_case("or", np.int16) +if __name__ == '__main__': + main() diff --git a/test/samples/Ors/ors_compare.py b/test/samples/Ors/ors_compare.py index 780b65b1..6173882b 100755 --- a/test/samples/Ors/ors_compare.py +++ 
b/test/samples/Ors/ors_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - 
- -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = 
np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if 
name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = 
load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = 
_float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], 
dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if 
meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - 
style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, 
dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, 
output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int16, 0.0) +if __name__ == '__main__': + compare_outputs(np.int16, atol=0.0) diff --git a/test/samples/Ors/ors_golden.py b/test/samples/Ors/ors_golden.py index 9ba60243..8f5cb3c5 100755 --- a/test/samples/Ors/ors_golden.py +++ b/test/samples/Ors/ors_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, 
Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - 
rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng 
= _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = 
_as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - 
src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int16, style='bitwise') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not 
os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = 
output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.bitwise_or(src, np.asarray(88, dtype=np.int16).item()) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int16)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_bitwise_scalar_case("ors", 88, np.int16) +if __name__ == '__main__': + main() diff --git a/test/samples/Partadd/partadd_compare.py b/test/samples/Partadd/partadd_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Partadd/partadd_compare.py +++ b/test/samples/Partadd/partadd_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, 
List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, 
outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> 
np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in 
meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): 
- meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src 
= _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], 
dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if 
meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - 
style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, 
dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, 
output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Partadd/partadd_golden.py b/test/samples/Partadd/partadd_golden.py index b193bfff..d6832130 100755 --- a/test/samples/Partadd/partadd_golden.py +++ b/test/samples/Partadd/partadd_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - 
"aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - 
return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, 
storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") 
- - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = 
False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = 
"positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = 
np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = 
_single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - 
src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output 
= np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not 
np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = lhs + rhs + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("add") +if __name__ == '__main__': + main() diff --git a/test/samples/Partmax/partmax_compare.py b/test/samples/Partmax/partmax_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Partmax/partmax_compare.py +++ b/test/samples/Partmax/partmax_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) 
+ break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - 
raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise 
ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise 
ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else 
"signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - 
elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = 
_float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], 
style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = 
_default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, 
storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == 
"and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - 
if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, 
out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Partmax/partmax_golden.py b/test/samples/Partmax/partmax_golden.py index 41cf4f08..e925a6a6 100755 --- a/test/samples/Partmax/partmax_golden.py +++ b/test/samples/Partmax/partmax_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": 
np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = 
rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - 
word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], 
dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", 
"sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - 
elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = 
meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, 
meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = 
src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = 
meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") 
- return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): 
{golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.maximum(lhs, rhs) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("max") +if __name__ == '__main__': + main() diff --git a/test/samples/Partmin/partmin_compare.py b/test/samples/Partmin/partmin_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Partmin/partmin_compare.py +++ b/test/samples/Partmin/partmin_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, 
- "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> 
np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in 
enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name 
in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], 
style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == 
"recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + 
np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - 
elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == 
"rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 
inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = 
np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif 
np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = 
load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Partmin/partmin_golden.py b/test/samples/Partmin/partmin_golden.py index 5da825df..d620b49a 100755 --- a/test/samples/Partmin/partmin_golden.py +++ b/test/samples/Partmin/partmin_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - 
-@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < 
np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def 
unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got 
{arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = 
_float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == 
"rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = 
src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op 
== "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() 
- if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = 
np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - 
output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def 
compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.minimum(lhs, rhs) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("min") +if __name__ == '__main__': + main() diff --git a/test/samples/Prelu/prelu_compare.py b/test/samples/Prelu/prelu_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Prelu/prelu_compare.py +++ b/test/samples/Prelu/prelu_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - 
"uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = 
rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, 
"little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise 
ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, 
meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - 
- -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - 
-def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - 
-def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, 
{_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), 
dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, 
{_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp 
= golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, 
cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Prelu/prelu_golden.py b/test/samples/Prelu/prelu_golden.py index 84dd74ff..4d14cf76 100755 --- a/test/samples/Prelu/prelu_golden.py +++ b/test/samples/Prelu/prelu_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in 
self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, 
size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - 
raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: 
str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = 
np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_prelu_case(): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() src_name, slope_name = 
meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + slope = float_values(generator, meta.elem_counts[slope_name], style='signed_small') + buffers = default_buffers(meta) buffers[src_name] = src buffers[slope_name] = slope - _write_buffers(meta, buffers) + write_buffers(meta, buffers) out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - 
_write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand 
binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, 
size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - 
_write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - 
else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def 
compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_prelu_case() +if __name__ == '__main__': + main() diff --git a/test/samples/Recip/recip_compare.py b/test/samples/Recip/recip_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Recip/recip_compare.py +++ b/test/samples/Recip/recip_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name 
in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - 
arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - 
if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def 
generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - 
elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, 
meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - 
buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - 
buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - 
_write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - 
buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} 
else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, 
out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - 
print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Recip/recip_golden.py b/test/samples/Recip/recip_golden.py index dbc359d1..267b826c 100755 --- a/test/samples/Recip/recip_golden.py +++ b/test/samples/Recip/recip_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = 
"outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == 
np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = 
np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - 
rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src 
* np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = 
load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: 
expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, 
write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='positive') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" 
- src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, 
dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = 1.0 / src + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def 
finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_unary_float_case("recip") +if __name__ == '__main__': + main() diff --git a/test/samples/Relu/relu_compare.py b/test/samples/Relu/relu_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Relu/relu_compare.py +++ b/test/samples/Relu/relu_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", 
outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = 
np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = 
buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = 
"nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - 
elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) 
- - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported 
row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = 
meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = 
load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = 
np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not 
os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 
1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Relu/relu_golden.py b/test/samples/Relu/relu_golden.py index 3e014456..0f8dba16 100755 --- a/test/samples/Relu/relu_golden.py +++ b/test/samples/Relu/relu_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): 
_host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = 
rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) 
- for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == 
"sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: 
- raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = 
_float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 
non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got 
{meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = 
_float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = 
np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not 
os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.maximum(src, np.float32(0.0)) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - 
print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_unary_float_case("relu") +if __name__ == '__main__': + main() diff --git a/test/samples/Rem/rem_compare.py b/test/samples/Rem/rem_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Rem/rem_compare.py +++ b/test/samples/Rem/rem_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) 
- } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if 
style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - 
width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - 
out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == 
"recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], 
style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise 
ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - 
raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = 
np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Rem/rem_golden.py b/test/samples/Rem/rem_golden.py index 1e439e88..f3c77d2c 100755 --- a/test/samples/Rem/rem_golden.py +++ b/test/samples/Rem/rem_golden.py @@ 
-1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in 
Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return 
vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> 
Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='nonzero_signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) 
- - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / 
np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, 
meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) 
!= 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - 
raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = 
np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.fmod(lhs, rhs) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("rem") +if __name__ == '__main__': + main() diff --git a/test/samples/Rems/rems_compare.py b/test/samples/Rems/rems_compare.py index 081d562c..03205d0a 100755 --- 
a/test/samples/Rems/rems_compare.py +++ b/test/samples/Rems/rems_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if 
Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise 
ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], 
dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": 
- out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - 
buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None 
- rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = 
load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise 
ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, 
{_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - 
print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return 
False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Rems/rems_golden.py b/test/samples/Rems/rems_golden.py index 966abd2d..80e6f564 100755 --- a/test/samples/Rems/rems_golden.py +++ b/test/samples/Rems/rems_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for 
search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) 
- - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - 
raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for 
{name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() 
- if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = 
_rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not 
os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = 
output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.fmod(src, np.float32(3.14)) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("rems", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Rowexpand/rowexpand_compare.py b/test/samples/Rowexpand/rowexpand_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Rowexpand/rowexpand_compare.py +++ b/test/samples/Rowexpand/rowexpand_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import 
numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, 
outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> 
np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in 
meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): 
- meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src 
= _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], 
dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if 
meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - 
style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, 
dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, 
output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Rowexpand/rowexpand_golden.py b/test/samples/Rowexpand/rowexpand_golden.py index 38a11172..8e53085f 100755 --- a/test/samples/Rowexpand/rowexpand_golden.py +++ b/test/samples/Rowexpand/rowexpand_golden.py @@ -1,737 +1,28 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - 
-_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} 
elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") 
- out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - 
arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = 
"cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = 
np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, 
buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def 
generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + 
[src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + src_m = matrix32(src) + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = 
np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not 
np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.repeat(src_m[:, :1], 32, axis=1) + write_golden(meta, {single_output(meta): out.astype(np.float32).reshape(-1)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_rowexpand_case() +if __name__ == '__main__': + main() diff --git a/test/samples/Rowexpanddiv/rowexpanddiv_compare.py b/test/samples/Rowexpanddiv/rowexpanddiv_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Rowexpanddiv/rowexpanddiv_compare.py +++ b/test/samples/Rowexpanddiv/rowexpanddiv_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 
'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = 
np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = 
_packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - 
expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - 
style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op 
== "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got 
{meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if 
accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - 
row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = 
meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") 
- return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): 
{golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Rowexpanddiv/rowexpanddiv_golden.py b/test/samples/Rowexpanddiv/rowexpanddiv_golden.py index 9810472b..cf7134bc 100755 --- a/test/samples/Rowexpanddiv/rowexpanddiv_golden.py +++ b/test/samples/Rowexpanddiv/rowexpanddiv_golden.py @@ -1,737 +1,32 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": 
np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if 
style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 
64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name 
not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" 
else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - 
elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif 
op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = 
src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) +from validation_runtime import ROWS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_rowexpand_bin_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) + generator = rng() + src0 = float_values(generator, meta.elem_counts[src0_name], style='signed') + src1 = float_values(generator, 
meta.elem_counts[src1_name], style='nonzero_signed' if 'Rowexpanddiv/rowexpanddiv_golden.py' == 'Rowexpanddiv/rowexpanddiv_golden.py' else 'signed') + src0_m = matrix32(src0) + src1_m = matrix32(src1) row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) + buffers = default_buffers(meta) buffers[src0_name] = src0 buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = 
np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = 
output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src0_m / row_scalars[:, None] + write_golden(meta, {single_output(meta): out.astype(np.float32).reshape(-1)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_rowexpand_bin_case("rowexpanddiv") +if __name__ == '__main__': + main() diff --git a/test/samples/Rowexpandmul/rowexpandmul_compare.py b/test/samples/Rowexpandmul/rowexpandmul_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Rowexpandmul/rowexpandmul_compare.py +++ b/test/samples/Rowexpandmul/rowexpandmul_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, 
Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = 
ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - 
rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], 
dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng 
= _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = 
_as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from 
count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if 
diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Rowexpandmul/rowexpandmul_golden.py b/test/samples/Rowexpandmul/rowexpandmul_golden.py index 4611be12..5bbd3405 100755 --- a/test/samples/Rowexpandmul/rowexpandmul_golden.py +++ b/test/samples/Rowexpandmul/rowexpandmul_golden.py @@ -1,737 +1,32 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": 
np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def 
_float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in 
range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, 
outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = 
_float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == 
"rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = 
src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) +from validation_runtime import ROWS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_rowexpand_bin_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) + generator = rng() + src0 = 
float_values(generator, meta.elem_counts[src0_name], style='signed') + src1 = float_values(generator, meta.elem_counts[src1_name], style='nonzero_signed' if 'Rowexpandmul/rowexpandmul_golden.py' == 'Rowexpanddiv/rowexpanddiv_golden.py' else 'signed') + src0_m = matrix32(src0) + src1_m = matrix32(src1) row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) + buffers = default_buffers(meta) buffers[src0_name] = src0 buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - 
buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: 
np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - 
print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes 
= min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src0_m * row_scalars[:, None] + write_golden(meta, {single_output(meta): out.astype(np.float32).reshape(-1)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_rowexpand_bin_case("rowexpandmul") +if __name__ == '__main__': + main() diff --git a/test/samples/Rowexpandsub/rowexpandsub_compare.py b/test/samples/Rowexpandsub/rowexpandsub_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Rowexpandsub/rowexpandsub_compare.py +++ b/test/samples/Rowexpandsub/rowexpandsub_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from 
pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): 
- return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, 
dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: 
- raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, 
meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], 
dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if 
meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - 
style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, 
dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, 
output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Rowexpandsub/rowexpandsub_golden.py b/test/samples/Rowexpandsub/rowexpandsub_golden.py index 0016d52b..c6b3d67c 100755 --- a/test/samples/Rowexpandsub/rowexpandsub_golden.py +++ b/test/samples/Rowexpandsub/rowexpandsub_golden.py @@ -1,737 +1,32 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 
-COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected 
{expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for 
cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got 
{arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == 
"cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == 
"log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") 
- buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - 
_write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) +from validation_runtime import ROWS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_rowexpand_bin_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = 
_as_matrix(src1) + generator = rng() + src0 = float_values(generator, meta.elem_counts[src0_name], style='signed') + src1 = float_values(generator, meta.elem_counts[src1_name], style='nonzero_signed' if 'Rowexpandsub/rowexpandsub_golden.py' == 'Rowexpanddiv/rowexpanddiv_golden.py' else 'signed') + src0_m = matrix32(src0) + src1_m = matrix32(src1) row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) + buffers = default_buffers(meta) buffers[src0_name] = src0 buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - 
buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - 
if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = 
output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src0_m - row_scalars[:, None] + write_golden(meta, {single_output(meta): out.astype(np.float32).reshape(-1)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_rowexpand_bin_case("rowexpandsub") +if __name__ == '__main__': + main() diff --git a/test/samples/Rowmax/rowmax_compare.py b/test/samples/Rowmax/rowmax_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Rowmax/rowmax_compare.py +++ b/test/samples/Rowmax/rowmax_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from 
pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): 
- return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, 
dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: 
- raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, 
meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], 
dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if 
meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - 
style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, 
dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, 
output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Rowmax/rowmax_golden.py b/test/samples/Rowmax/rowmax_golden.py index 71a790f9..74521a93 100755 --- a/test/samples/Rowmax/rowmax_golden.py +++ b/test/samples/Rowmax/rowmax_golden.py @@ -1,737 +1,38 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - 
"aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - 
return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, 
storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") 
- - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", 
"rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": 
- out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - 
buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == 
"colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, 
buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = 
load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == 
"xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) +from validation_runtime import ROWS, COLS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + src_m = matrix32(src) + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) + write_buffers(meta, buffers) + reduced = np.asarray(src_m.max(axis=1), dtype=np.float32) + out = np.asarray(buffers.get(out_name, np.zeros(meta.elem_counts[out_name], dtype=np.float32)), dtype=np.float32).reshape(-1).copy() + if out.size == ROWS * COLS: + out_m = matrix32(out) + out_m[:, 0] = reduced + out = out_m.reshape(-1) + elif out.size == ROWS: + out = reduced else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, 
dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - 
f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + raise ValueError(f'unsupported row-reduce output size: {out.size}') + write_golden(meta, {out_name: out}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_row_reduce_case("rowmax") +if __name__ == '__main__': + main() diff --git a/test/samples/Rowmin/rowmin_compare.py b/test/samples/Rowmin/rowmin_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Rowmin/rowmin_compare.py +++ b/test/samples/Rowmin/rowmin_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re 
-import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - 
outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 
63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - 
-def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = 
"exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name 
is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = 
np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = 
output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Rowmin/rowmin_golden.py b/test/samples/Rowmin/rowmin_golden.py index d5d5ed08..f5295a56 100755 --- a/test/samples/Rowmin/rowmin_golden.py +++ b/test/samples/Rowmin/rowmin_golden.py @@ -1,737 +1,38 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break 
-SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise 
ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise 
ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise 
ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else 
"signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - 
elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = 
_float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], 
style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = 
_default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, 
storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == 
"and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) +from validation_runtime import ROWS, COLS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + src_m = matrix32(src) + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) + write_buffers(meta, buffers) + reduced = np.asarray(src_m.min(axis=1), dtype=np.float32) + out = np.asarray(buffers.get(out_name, np.zeros(meta.elem_counts[out_name], dtype=np.float32)), dtype=np.float32).reshape(-1).copy() + if out.size == ROWS * COLS: + out_m = matrix32(out) + out_m[:, 0] = reduced + out = out_m.reshape(-1) + elif out.size == ROWS: + out = reduced else: - raise 
ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, 
dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + raise ValueError(f'unsupported row-reduce output size: {out.size}') + write_golden(meta, {out_name: out}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_row_reduce_case("rowmin") +if __name__ == '__main__': + main() diff --git a/test/samples/Rowsum/rowsum_compare.py b/test/samples/Rowsum/rowsum_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Rowsum/rowsum_compare.py +++ 
b/test/samples/Rowsum/rowsum_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - 
outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: 
{dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def 
_default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - 
raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - 
_write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src 
= _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if 
len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: 
{op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - 
if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = 
output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Rowsum/rowsum_golden.py b/test/samples/Rowsum/rowsum_golden.py index bddeaf31..aed41409 100755 --- a/test/samples/Rowsum/rowsum_golden.py +++ b/test/samples/Rowsum/rowsum_golden.py @@ -1,737 +1,38 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in 
(Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def 
_as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise 
ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for 
{name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() 
- if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) +from validation_runtime import ROWS, COLS, default_buffers, float_values, load_case_meta, matrix32, rng, single_output, write_buffers, write_golden -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + src_m = matrix32(src) + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) + write_buffers(meta, buffers) + reduced = np.asarray(src_m.sum(axis=1, dtype=np.float32), dtype=np.float32) + out = np.asarray(buffers.get(out_name, np.zeros(meta.elem_counts[out_name], dtype=np.float32)), dtype=np.float32).reshape(-1).copy() + 
if out.size == ROWS * COLS: + out_m = matrix32(out) + out_m[:, 0] = reduced + out = out_m.reshape(-1) + elif out.size == ROWS: + out = reduced else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not 
os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + raise ValueError(f'unsupported row-reduce output size: {out.size}') + write_golden(meta, {out_name: out}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_row_reduce_case("rowsum") +if __name__ == '__main__': + main() diff 
--git a/test/samples/Rsqrt/rsqrt_compare.py b/test/samples/Rsqrt/rsqrt_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Rsqrt/rsqrt_compare.py +++ b/test/samples/Rsqrt/rsqrt_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - 
r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == 
"shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = 
((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif 
op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = 
_float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = 
meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], 
ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - 
buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): 
np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - 
print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Rsqrt/rsqrt_golden.py b/test/samples/Rsqrt/rsqrt_golden.py index 9959d8ef..fc44506e 100755 --- a/test/samples/Rsqrt/rsqrt_golden.py +++ b/test/samples/Rsqrt/rsqrt_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses 
import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return 
CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - 
-def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def 
_write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = 
"exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name 
is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def 
generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='positive') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, 
dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - 
f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = 1.0 / np.sqrt(src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_unary_float_case("rsqrt") +if __name__ == '__main__': + main() diff --git a/test/samples/Sel/sel_compare.py b/test/samples/Sel/sel_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Sel/sel_compare.py +++ b/test/samples/Sel/sel_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re 
-import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - 
outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 
63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - 
-def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = 
"exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name 
is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = 
np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = 
output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Sel/sel_golden.py b/test/samples/Sel/sel_golden.py index bb714af8..880988bb 100755 --- a/test/samples/Sel/sel_golden.py +++ b/test/samples/Sel/sel_golden.py @@ -1,737 +1,33 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 
-COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected 
{expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for 
cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got 
{arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == 
"cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == 
"log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") 
- buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - 
_write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - 
buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, 
{out_name: packed}) +from validation_runtime import ROWS, COLS, default_buffers, float_values, load_case_meta, matrix32, pack_predicate_mask, rng, single_output, write_buffers, write_golden -def generate_sel_case(): +def main(): meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() mask_name, src0_name, src1_name = meta.inputs + generator = rng() storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) + mask_bits = generator.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) + src0 = float_values(generator, meta.elem_counts[src0_name], style='signed') + src1 = float_values(generator, meta.elem_counts[src1_name], style='signed') + buffers = default_buffers(meta) buffers[mask_name] = mask buffers[src0_name] = src0 buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, 
dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - 
print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes 
= min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.where(mask_bits, matrix32(src0), matrix32(src1)) + write_golden(meta, {single_output(meta): out.astype(np.float32).reshape(-1)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_sel_case() +if __name__ == '__main__': + main() diff --git a/test/samples/Sels/sels_compare.py b/test/samples/Sels/sels_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Sels/sels_compare.py +++ b/test/samples/Sels/sels_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in 
(Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def 
_as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise 
ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for 
{name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: 
expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() 
- if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if 
diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Sels/sels_golden.py b/test/samples/Sels/sels_golden.py index 37417eb0..f4830932 100755 --- a/test/samples/Sels/sels_golden.py +++ b/test/samples/Sels/sels_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": 
np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if 
style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 
64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name 
not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" 
else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - 
elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif 
op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = 
src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / 
row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sel_case(): +def 
main(): meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) + generator = rng() + src0 = float_values(generator, meta.elem_counts[src0_name], style='signed') + src1 = float_values(generator, meta.elem_counts[src1_name], style='signed') + buffers = default_buffers(meta) buffers[src0_name] = src0 buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, 
style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs 
{output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] 
Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src0 if 64 == 1 else src1 + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_sels_case(64) +if __name__ == '__main__': + main() diff --git a/test/samples/Shl/shl_compare.py b/test/samples/Shl/shl_compare.py index 165980ed..8abe2165 100755 --- a/test/samples/Shl/shl_compare.py +++ b/test/samples/Shl/shl_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": 
np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = 
rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, 
"little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise 
ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, 
meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - 
- -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - 
-def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - 
-def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, 
{_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), 
dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, 
{_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp 
= golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, 
cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int32, 0.0) +if __name__ == '__main__': + compare_outputs(np.int32, atol=0.0) diff --git a/test/samples/Shl/shl_golden.py b/test/samples/Shl/shl_golden.py index be681c3a..4d5c8139 100755 --- a/test/samples/Shl/shl_golden.py +++ b/test/samples/Shl/shl_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - 
elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = 
np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: 
int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def 
_single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, 
buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, 
got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, 
got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = 
_default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], 
style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int32, style='shift_small') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = 
np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or 
np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in 
meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.left_shift(src, src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_bitwise_self_case("shl", np.int32) +if __name__ == '__main__': + main() diff --git a/test/samples/Shls/shls_compare.py b/test/samples/Shls/shls_compare.py index 165980ed..8abe2165 100755 --- a/test/samples/Shls/shls_compare.py +++ b/test/samples/Shls/shls_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name 
not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 
2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: 
- raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: 
str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = 
np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, 
meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - 
buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - 
buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - 
_write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - 
buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} 
else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, 
out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 
'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int32, 0.0) +if __name__ == '__main__': + compare_outputs(np.int32, atol=0.0) diff --git a/test/samples/Shls/shls_golden.py b/test/samples/Shls/shls_golden.py index 020268aa..1cef4efe 100755 --- a/test/samples/Shls/shls_golden.py +++ b/test/samples/Shls/shls_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not 
in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: 
- raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes 
= _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - 
rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, 
np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope 
- _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, 
dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() 
- if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int32, style='shift_small') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = 
int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", 
f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.left_shift(src, np.asarray(2, dtype=np.int32).item()) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_bitwise_scalar_case("shls", 2, np.int32) +if __name__ == '__main__': + main() diff --git a/test/samples/Shr/shr_compare.py b/test/samples/Shr/shr_compare.py index 165980ed..8abe2165 100755 --- a/test/samples/Shr/shr_compare.py +++ b/test/samples/Shr/shr_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: 
str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - 
dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = 
buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = 
"nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - 
elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) 
- - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported 
row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = 
meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = 
load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = 
np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not 
os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] 
compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int32, 0.0) +if __name__ == '__main__': + compare_outputs(np.int32, atol=0.0) diff --git a/test/samples/Shr/shr_golden.py b/test/samples/Shr/shr_golden.py index fdadc6b5..e2affab7 100755 --- a/test/samples/Shr/shr_golden.py +++ b/test/samples/Shr/shr_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = 
Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == 
"bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) 
- for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, 
meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - 
elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if 
len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got 
{meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - 
src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def 
generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int32, style='shift_small') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = 
_int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" 
- ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.right_shift(src, src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int32)}) -def 
finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_bitwise_self_case("shr", np.int32) +if __name__ == '__main__': + main() diff --git a/test/samples/Shrs/shrs_compare.py b/test/samples/Shrs/shrs_compare.py index 165980ed..8abe2165 100755 --- a/test/samples/Shrs/shrs_compare.py +++ b/test/samples/Shrs/shrs_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", 
text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): 
- if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") 
- width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - 
out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == 
"recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], 
style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise 
ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - 
raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = 
np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int32, 0.0) +if __name__ == '__main__': + 
compare_outputs(np.int32, atol=0.0) diff --git a/test/samples/Shrs/shrs_golden.py b/test/samples/Shrs/shrs_golden.py index 91c87797..4b6320a2 100755 --- a/test/samples/Shrs/shrs_golden.py +++ b/test/samples/Shrs/shrs_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - 
r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == 
"shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = 
((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif 
op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - 
_write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = 
_float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = 
meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], 
ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = 
_float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int32, style='shift_small') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = 
np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: 
{output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.right_shift(src, np.asarray(2, dtype=np.int32).item()) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if 
__name__ == "__main__": - generate_bitwise_scalar_case("shrs", 2, np.int32) +if __name__ == '__main__': + main() diff --git a/test/samples/Sqrt/sqrt_compare.py b/test/samples/Sqrt/sqrt_compare.py index 081d562c..03205d0a 100755 --- a/test/samples/Sqrt/sqrt_compare.py +++ b/test/samples/Sqrt/sqrt_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - 
match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - 
vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - 
base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif 
op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - 
else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = 
_float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 
non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got 
{meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - 
buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: 
{op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * 
int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-3) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.001) diff --git a/test/samples/Sqrt/sqrt_golden.py b/test/samples/Sqrt/sqrt_golden.py index 479de04b..f441092a 100755 --- a/test/samples/Sqrt/sqrt_golden.py +++ b/test/samples/Sqrt/sqrt_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - 
-import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if 
line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) 
-> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for 
name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = 
"signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise 
ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - 
src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name 
is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='positive') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, 
{_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or 
output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.sqrt(src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_unary_float_case("sqrt") +if __name__ == '__main__': + main() diff --git a/test/samples/Sub/sub_compare.py b/test/samples/Sub/sub_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Sub/sub_compare.py +++ b/test/samples/Sub/sub_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - 
-import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if 
line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) 
-> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for 
name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = 
"signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise 
ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - 
src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name 
is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 
1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - 
dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, 
:row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Sub/sub_golden.py b/test/samples/Sub/sub_golden.py index 6fb865db..593d8070 100755 --- a/test/samples/Sub/sub_golden.py +++ b/test/samples/Sub/sub_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, 
str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if 
flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise 
ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise 
ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = 
"signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise 
ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - 
src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name 
is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 
1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - 
dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, 
:row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = lhs - rhs + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("sub") +if __name__ == '__main__': + main() diff --git a/test/samples/Subc/subc_compare.py b/test/samples/Subc/subc_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Subc/subc_compare.py +++ b/test/samples/Subc/subc_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 
'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = 
np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = 
_packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - 
expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - 
style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op 
== "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got 
{meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if 
accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - 
row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = 
meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") 
- return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): 
{golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Subc/subc_golden.py b/test/samples/Subc/subc_golden.py index fcb6a7c5..f782b058 100755 --- a/test/samples/Subc/subc_golden.py +++ b/test/samples/Subc/subc_golden.py @@ -1,737 +1,31 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": 
np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, 
size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 
- for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden 
for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", 
"rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, 
np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_addc_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) + generator = rng() + a = float_values(generator, meta.elem_counts[a_name], style='signed') + b = float_values(generator, meta.elem_counts[b_name], style='signed') + c = float_values(generator, meta.elem_counts[c_name], style='signed_small') + buffers = default_buffers(meta) buffers[a_name] = a buffers[b_name] = b buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if 
len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = 
np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else 
"signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage 
stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, 
meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: 
{golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if 
diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = a - b + c + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_addc_case("subc") +if __name__ == '__main__': + main() diff --git a/test/samples/Subs/subs_compare.py b/test/samples/Subs/subs_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Subs/subs_compare.py +++ b/test/samples/Subs/subs_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - 
"double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, 
style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, 
base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): 
- for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, 
meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 
/ np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op 
== "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - 
out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, 
None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise 
ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == 
"shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, 
copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - 
meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Subs/subs_golden.py b/test/samples/Subs/subs_golden.py index 3c8ddaab..daf6cc27 100755 --- a/test/samples/Subs/subs_golden.py +++ b/test/samples/Subs/subs_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class 
CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - 
arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: 
np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - 
arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - 
buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: 
- raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) 
!= 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - 
raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - 
meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = 
np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or 
np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in 
meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src - np.float32(3.14) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_float_case("subs", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/Subsc/subsc_compare.py b/test/samples/Subsc/subsc_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/Subsc/subsc_compare.py +++ b/test/samples/Subsc/subsc_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - 
-@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < 
np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def 
unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got 
{arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = 
_default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - 
if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = 
load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = 
load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - 
- -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, 
storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def 
generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = 
output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True 
- for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/Subsc/subsc_golden.py b/test/samples/Subsc/subsc_golden.py index 7dca2d92..7d625d7e 100755 --- a/test/samples/Subsc/subsc_golden.py +++ b/test/samples/Subsc/subsc_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) 
-> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr 
= rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible 
by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - 
raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / 
np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = 
_default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def 
generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def 
generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = 
output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True 
- for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = src - np.float32(3.14) + src + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_scalar_carry_case("subsc", 3.14) +if __name__ == '__main__': + main() diff --git a/test/samples/VectorAddition/vadd_pto_ir_compare.py b/test/samples/VectorAddition/vadd_pto_ir_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/VectorAddition/vadd_pto_ir_compare.py +++ b/test/samples/VectorAddition/vadd_pto_ir_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def 
inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, 
size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 
0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got 
{meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - 
elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, 
meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, 
meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, 
meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = 
np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = 
_default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = 
_rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs 
{output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = 
os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/VectorAddition/vadd_pto_ir_golden.py b/test/samples/VectorAddition/vadd_pto_ir_golden.py index b193bfff..d6832130 100755 --- a/test/samples/VectorAddition/vadd_pto_ir_golden.py +++ b/test/samples/VectorAddition/vadd_pto_ir_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: 
{host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - 
return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - 
raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) 
!= 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + 
np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - 
src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = 
_rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() 
- src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, 
buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, 
meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = 
float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return 
finalize_compare(ok) - + write_buffers(meta, buffers) + out = lhs + rhs + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("add") +if __name__ == '__main__': + main() diff --git a/test/samples/VectorAddition/vectorAddition_compare.py b/test/samples/VectorAddition/vectorAddition_compare.py index 891703ea..2a923d5f 100755 --- a/test/samples/VectorAddition/vectorAddition_compare.py +++ b/test/samples/VectorAddition/vectorAddition_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> 
np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = 
rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible 
by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - 
raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / 
np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = 
_default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def 
generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - 
_write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = 
_int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" 
- ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - - -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - 
print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.float32, 1e-4) +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0001) diff --git a/test/samples/VectorAddition/vectorAddition_golden.py b/test/samples/VectorAddition/vectorAddition_golden.py index b193bfff..d6832130 100755 --- a/test/samples/VectorAddition/vectorAddition_golden.py +++ b/test/samples/VectorAddition/vectorAddition_golden.py @@ -1,737 +1,29 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", 
outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = 
np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = 
buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_binary_float_case(op: str): +def main(): meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() 
lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) + generator = rng() + lhs = float_values(generator, meta.elem_counts[lhs_name], style='signed') + rhs = float_values(generator, meta.elem_counts[rhs_name], style='signed') + buffers = default_buffers(meta) buffers[lhs_name] = lhs buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out 
= src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") 
- slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - 
src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m 
= _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), 
dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - 
buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" 
if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - 
f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = lhs + rhs + write_golden(meta, {single_output(meta): 
np.asarray(out, dtype=np.float32)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_binary_float_case("add") +if __name__ == '__main__': + main() diff --git a/test/samples/Xor/xor_compare.py b/test/samples/Xor/xor_compare.py index 780b65b1..6173882b 100755 --- a/test/samples/Xor/xor_compare.py +++ b/test/samples/Xor/xor_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in 
re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 
style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = 
int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - 
buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = 
np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, 
b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = 
load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], 
style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - 
elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, 
meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, 
scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden 
= np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - 
compare_all_outputs(np.int16, 0.0) +if __name__ == '__main__': + compare_outputs(np.int16, atol=0.0) diff --git a/test/samples/Xor/xor_golden.py b/test/samples/Xor/xor_golden.py index a5786585..77eb0dc1 100755 --- a/test/samples/Xor/xor_golden.py +++ b/test/samples/Xor/xor_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): 
_host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = 
rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) 
- for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == 
"sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: 
- raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = 
_float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 
non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - 
_write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got 
{meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = 
_float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int16, style='bitwise') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - 
out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not 
os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.bitwise_xor(src, src) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int16)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] 
compare passed") - return True -if __name__ == "__main__": - generate_bitwise_self_case("xor", np.int16) +if __name__ == '__main__': + main() diff --git a/test/samples/Xors/xors_compare.py b/test/samples/Xors/xors_compare.py index 780b65b1..6173882b 100755 --- a/test/samples/Xors/xors_compare.py +++ b/test/samples/Xors/xors_compare.py @@ -1,737 +1,14 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Dict, List - -import numpy as np - - -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = 
re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = [line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise 
ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: {dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: 
str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def _default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif 
op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def 
generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = 
_default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - _write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 
input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): 
out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = 
None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: {op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = 
src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_bitwise_self_case(op: str, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, 
dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - 
f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True +import numpy as np +from validation_runtime import compare_outputs -if __name__ == "__main__": - compare_all_outputs(np.int16, 0.0) +if __name__ == '__main__': + compare_outputs(np.int16, atol=0.0) diff --git a/test/samples/Xors/xors_golden.py b/test/samples/Xors/xors_golden.py index 91e32d92..267946bf 100755 --- a/test/samples/Xors/xors_golden.py +++ 
b/test/samples/Xors/xors_golden.py @@ -1,737 +1,27 @@ #!/usr/bin/python3 -# coding=utf-8 - -import os -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List - import numpy as np +from pathlib import Path +import sys +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break -SEED = 19 -ROWS = 32 -COLS = 32 - -_HOST_TYPE_TO_NP = { - "aclFloat16": np.float16, - "bfloat16_t": np.uint16, - "bool": np.bool_, - "double": np.float64, - "float": np.float32, - "half": np.float16, - "int": np.int32, - "int8_t": np.int8, - "int16_t": np.int16, - "int32_t": np.int32, - "int64_t": np.int64, - "size_t": np.uint64, - "uint8_t": np.uint8, - "uint16_t": np.uint16, - "uint32_t": np.uint32, - "uint64_t": np.uint64, - "unsigned": np.uint32, -} - - -@dataclass -class CaseMeta: - elem_counts: Dict[str, int] - np_types: Dict[str, np.dtype] - read_order: List[str] - outputs: List[str] - - @property - def inputs(self) -> List[str]: - return [name for name in self.read_order if name not in self.outputs] - - -def _host_type_to_np(host_type: str) -> np.dtype: - host_type = host_type.strip() - if host_type not in _HOST_TYPE_TO_NP: - raise KeyError(f"unsupported host type: {host_type}") - return np.dtype(_HOST_TYPE_TO_NP[host_type]) - - -def load_case_meta(main_cpp: str = "main.cpp", outputs_txt: str = "outputs.txt") -> CaseMeta: - text = Path(main_cpp).read_text(encoding="utf-8") - elem_counts = { - match.group(1): int(match.group(2)) - for match in re.finditer(r"size_t\s+elemCount_(\w+)\s*=\s*(\d+);", text) - } - np_types = { - match.group(1): _host_type_to_np(match.group(2)) - for match in re.finditer( - r"size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);", - text, - ) - } - read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) - if Path(outputs_txt).is_file(): - outputs = 
[line.strip() for line in Path(outputs_txt).read_text(encoding="utf-8").splitlines() if line.strip()] - else: - outputs = [] - return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) - - -def _rng(): - return np.random.default_rng(SEED) - - -def _as_matrix(arr: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - flat = np.asarray(arr).reshape(-1) - expected = rows * cols - if flat.size != expected: - raise ValueError(f"expected {expected} elements, got {flat.size}") - return flat.reshape(rows, cols) - - -def _float_values(rng, count: int, *, style: str) -> np.ndarray: - if style == "signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - elif style == "signed_small": - arr = rng.uniform(-1.5, 1.5, size=count).astype(np.float32) - elif style == "nonzero_signed": - arr = rng.uniform(-3.0, 3.0, size=count).astype(np.float32) - mask = np.abs(arr) < np.float32(0.25) - arr[mask] = np.where(arr[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) - elif style == "positive": - arr = rng.uniform(0.25, 4.0, size=count).astype(np.float32) - elif style == "exp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - elif style == "cmp": - arr = rng.uniform(-2.0, 2.0, size=count).astype(np.float32) - else: - raise ValueError(f"unsupported float style: {style}") - return arr - - -def _int_values(rng, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: - dtype = np.dtype(dtype) - if dtype == np.dtype(np.int16): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int16 style: {style}") - elif dtype == np.dtype(np.int32): - if style == "bitwise": - vals = rng.integers(-256, 256, size=count, dtype=np.int32) - elif style == "shift_small": - vals = rng.integers(0, 4, size=count, dtype=np.int32) - else: - raise ValueError(f"unsupported int32 style: {style}") - else: - raise ValueError(f"unsupported dtype/style pair: 
{dtype}/{style}") - return vals.astype(dtype, copy=False) - - -def _packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 - - -def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: - bits = np.asarray(bits, dtype=np.bool_) - if bits.ndim != 2: - raise ValueError("mask bits must be a 2D array") - rows, cols = bits.shape - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - out = np.zeros((rows, storage_cols), dtype=np.uint8) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 - for bit_idx in range(width): - if bits[row, base_col + bit_idx]: - word |= 1 << bit_idx - out[row, word_idx * 8:(word_idx + 1) * 8] = np.frombuffer(word.to_bytes(8, "little"), dtype=np.uint8) - return out.reshape(-1) - - -def unpack_predicate_mask(buf: np.ndarray, *, rows: int = ROWS, cols: int = COLS) -> np.ndarray: - buf = np.asarray(buf, dtype=np.uint8).reshape(-1) - if rows <= 0 or cols <= 0: - raise ValueError("rows/cols must be positive") - if buf.size % rows != 0: - raise ValueError(f"mask buffer size {buf.size} is not divisible by rows={rows}") - storage_cols = buf.size // rows - row_bytes = _packed_row_bytes(cols) - if storage_cols < row_bytes: - raise ValueError(f"storage_cols={storage_cols} is too small for cols={cols}") - packed = buf.reshape(rows, storage_cols) - bits = np.zeros((rows, cols), dtype=np.bool_) - for row in range(rows): - for word_idx, base_col in enumerate(range(0, cols, 64)): - word = int.from_bytes(bytes(packed[row, word_idx * 8:(word_idx + 1) * 8]), "little") - width = min(64, cols - base_col) - for bit_idx in range(width): - bits[row, base_col + bit_idx] = ((word >> bit_idx) & 1) != 0 - return bits - - -def _zero_buffer(meta: CaseMeta, name: str) -> np.ndarray: - return np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) - - -def 
_default_buffers(meta: CaseMeta) -> Dict[str, np.ndarray]: - return {name: _zero_buffer(meta, name) for name in meta.read_order} - - -def _write_buffers(meta: CaseMeta, buffers: Dict[str, np.ndarray]): - for name in meta.read_order: - if name not in buffers: - raise KeyError(f"missing buffer for {name}") - arr = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} elements, got {arr.size}") - arr.tofile(f"{name}.bin") - - -def _write_golden(meta: CaseMeta, outputs: Dict[str, np.ndarray]): - for name in meta.outputs: - if name not in outputs: - raise KeyError(f"missing golden for {name}") - arr = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) - expected = meta.elem_counts[name] - if arr.size != expected: - raise ValueError(f"{name}: expected {expected} golden elements, got {arr.size}") - arr.tofile(f"golden_{name}.bin") - - -def _single_output(meta: CaseMeta) -> str: - if len(meta.outputs) != 1: - raise ValueError(f"expected exactly one output, got {meta.outputs}") - return meta.outputs[0] - - -def generate_binary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - lhs_name, rhs_name = meta.inputs - lhs = _float_values(rng, meta.elem_counts[lhs_name], style="signed") - rhs_style = "nonzero_signed" if op in {"div", "rem"} else "signed" - rhs = _float_values(rng, meta.elem_counts[rhs_name], style=rhs_style) - buffers = _default_buffers(meta) - buffers[lhs_name] = lhs - buffers[rhs_name] = rhs - _write_buffers(meta, buffers) - - if op == "add": - out = lhs + rhs - elif op == "sub": - out = lhs - rhs - elif op == "mul": - out = lhs * rhs - elif op == "div": - out = lhs / rhs - elif op == "max": - out = np.maximum(lhs, rhs) - elif op == "min": - out = np.minimum(lhs, rhs) - elif op == "rem": - out = np.fmod(lhs, rhs) - else: - 
raise ValueError(f"unsupported binary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_float_case(op: str, scalar: float, *, scalar_left: bool = False): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "positive" if op in {"log", "sqrt", "rsqrt", "recip"} else "signed" - if op == "exp": - style = "exp" - if op == "cmps": - style = "cmp" - if op in {"divs", "rems"}: - style = "signed" - src = _float_values(rng, meta.elem_counts[src_name], style=style if op != "divs2" else "nonzero_signed") - if op in {"divs", "rems"}: - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - if op in {"log", "sqrt", "rsqrt", "recip"}: - src = _float_values(rng, meta.elem_counts[src_name], style="positive") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "adds": - out = src + np.float32(scalar) - elif op == "subs": - out = src - np.float32(scalar) - elif op == "muls": - out = src * np.float32(scalar) - elif op == "divs": - out = np.float32(scalar) / src if scalar_left else src / np.float32(scalar) - elif op == "maxs": - out = np.maximum(src, np.float32(scalar)) - elif op == "mins": - out = np.minimum(src, np.float32(scalar)) - elif op == "rems": - out = np.fmod(src, np.float32(scalar)) - elif op == "lrelu": - out = np.where(src > 0.0, src, src * np.float32(scalar)) - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - else: - raise ValueError(f"unsupported scalar/unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_unary_float_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise 
ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "signed" - if op in {"exp"}: - style = "exp" - elif op in {"log", "sqrt", "rsqrt", "recip"}: - style = "positive" - src = _float_values(rng, meta.elem_counts[src_name], style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - - if op == "abs": - out = np.abs(src) - elif op == "neg": - out = -src - elif op == "exp": - out = np.exp(src) - elif op == "log": - out = np.log(src) - elif op == "sqrt": - out = np.sqrt(src) - elif op == "rsqrt": - out = 1.0 / np.sqrt(src) - elif op == "recip": - out = 1.0 / src - elif op == "relu": - out = np.maximum(src, np.float32(0.0)) - else: - raise ValueError(f"unsupported unary float op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_prelu_case(): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"prelu: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src_name, slope_name = meta.inputs - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - slope = _float_values(rng, meta.elem_counts[slope_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[src_name] = src - buffers[slope_name] = slope - _write_buffers(meta, buffers) - out = np.where(src > 0.0, src, src * slope) - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_addc_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"{op}: expected 3 inputs, got {meta.inputs}") - rng = _rng() - a_name, b_name, c_name = meta.inputs - a = _float_values(rng, meta.elem_counts[a_name], style="signed") - b = _float_values(rng, meta.elem_counts[b_name], style="signed") - c = _float_values(rng, meta.elem_counts[c_name], style="signed_small") - buffers = _default_buffers(meta) - buffers[a_name] = a - buffers[b_name] = b - buffers[c_name] = c - 
_write_buffers(meta, buffers) - if op == "addc": - out = a + b + c - elif op == "subc": - out = a - b + c - else: - raise ValueError(f"unsupported carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_scalar_carry_case(op: str, scalar: float): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "addsc": - out = src + np.float32(scalar) + src - elif op == "subsc": - out = src - np.float32(scalar) + src - else: - raise ValueError(f"unsupported scalar carry op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_row_reduce_case(op: str): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "rowsum": - out = src_m.sum(axis=1, dtype=np.float32) - elif op == "rowmax": - out = src_m.max(axis=1) - elif op == "rowmin": - out = src_m.min(axis=1) - else: - raise ValueError(f"unsupported row reduction op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - - -def generate_col_reduce_case(op: str, *, accumulate: bool = False): - meta = load_case_meta() - if op == "colsum": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 non-output inputs, got {meta.inputs}") - src_name, tmp_name = meta.inputs - else: - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src_name = meta.inputs[0] - tmp_name = None - rng = _rng() - src 
= _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - if tmp_name is not None: - buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - out_name = _single_output(meta) - out_init = np.zeros(meta.elem_counts[out_name], dtype=meta.np_types[out_name]) - if accumulate: - out_init = _float_values(rng, meta.elem_counts[out_name], style="signed_small") - buffers[out_name] = out_init - _write_buffers(meta, buffers) - if op == "colsum": - out = src_m.sum(axis=0, dtype=np.float32) - if accumulate: - out = out + out_init - elif op == "colmax": - out = src_m.max(axis=0) - elif op == "colmin": - out = src_m.min(axis=0) - else: - raise ValueError(f"unsupported col reduction op: {op}") - _write_golden(meta, {out_name: out.astype(np.float32)}) - - -def generate_rowexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"rowexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], COLS, axis=1) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_colexpand_case(): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"colexpand: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - src = _float_values(rng, meta.elem_counts[src_name], style="signed") - src_m = _as_matrix(src) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - out = np.repeat(src_m[:1, :], ROWS, axis=0) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_rowexpand_bin_case(op: str): - meta = load_case_meta() - if 
len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="nonzero_signed" if op == "rowexpanddiv" else "signed") - src0_m = _as_matrix(src0) - src1_m = _as_matrix(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - if op == "rowexpandmul": - out = src0_m * row_scalars[:, None] - elif op == "rowexpanddiv": - out = src0_m / row_scalars[:, None] - elif op == "rowexpandsub": - out = src0_m - row_scalars[:, None] - else: - raise ValueError(f"unsupported rowexpand binary op: {op}") - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - - -def generate_expands_case(scalar: float): - meta = load_case_meta() - buffers = _default_buffers(meta) - _write_buffers(meta, buffers) - out_name = _single_output(meta) - out = np.full(meta.elem_counts[out_name], np.float32(scalar), dtype=np.float32) - _write_golden(meta, {out_name: out}) - - -def generate_cmp_case(op: str, *, scalar: float = 0.0): - meta = load_case_meta() - rng = _rng() - if op == "cmp": - if len(meta.inputs) != 2: - raise ValueError(f"{op}: expected 2 inputs, got {meta.inputs}") - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="cmp") - pred = _as_matrix(src0) < _as_matrix(src1) - elif op == "cmps": - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - src0_name = meta.inputs[0] - src0 = _float_values(rng, meta.elem_counts[src0_name], style="cmp") - src1_name = None - src1 = None - pred = _as_matrix(src0) > np.float32(scalar) - else: - raise ValueError(f"unsupported compare op: 
{op}") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - if src1 is not None and src1_name is not None: - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out_name = _single_output(meta) - if meta.elem_counts[out_name] % ROWS != 0: - raise ValueError(f"{out_name}: cannot derive mask storage stride from count={meta.elem_counts[out_name]}") - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) - _write_golden(meta, {out_name: packed}) - - -def generate_sel_case(): - meta = load_case_meta() - if len(meta.inputs) != 3: - raise ValueError(f"sel: expected 3 inputs, got {meta.inputs}") - rng = _rng() - mask_name, src0_name, src1_name = meta.inputs - storage_cols = meta.elem_counts[mask_name] // ROWS - mask_bits = rng.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, storage_cols=storage_cols) - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[mask_name] = mask - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - out = np.where(mask_bits, _as_matrix(src0), _as_matrix(src1)) - _write_golden(meta, {_single_output(meta): out.astype(np.float32).reshape(-1)}) - +from validation_runtime import default_buffers, int_values, load_case_meta, rng, single_output, write_buffers, write_golden -def generate_sels_case(select_mode: int): - meta = load_case_meta() - if len(meta.inputs) != 2: - raise ValueError(f"sels: expected 2 inputs, got {meta.inputs}") - rng = _rng() - src0_name, src1_name = meta.inputs - src0 = _float_values(rng, meta.elem_counts[src0_name], style="signed") - src1 = _float_values(rng, meta.elem_counts[src1_name], style="signed") - buffers = _default_buffers(meta) - buffers[src0_name] = src0 - buffers[src1_name] = src1 - _write_buffers(meta, buffers) - 
out = src0 if int(select_mode) == 1 else src1 - _write_golden(meta, {_single_output(meta): out.astype(np.float32)}) - -def generate_bitwise_self_case(op: str, dtype: np.dtype): +def main(): meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shl", "shr"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) + [src_name] = meta.inputs + generator = rng() + src = int_values(generator, meta.elem_counts[src_name], dtype=np.int16, style='bitwise') + buffers = default_buffers(meta) buffers[src_name] = src - _write_buffers(meta, buffers) - if op == "and": - out = np.bitwise_and(src, src) - elif op == "or": - out = np.bitwise_or(src, src) - elif op == "xor": - out = np.bitwise_xor(src, src) - elif op == "shl": - out = np.left_shift(src, src) - elif op == "shr": - out = np.right_shift(src, src) - elif op == "not": - out = np.bitwise_not(src) - else: - raise ValueError(f"unsupported bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def generate_bitwise_scalar_case(op: str, scalar: int, dtype: np.dtype): - meta = load_case_meta() - if len(meta.inputs) != 1: - raise ValueError(f"{op}: expected 1 input, got {meta.inputs}") - rng = _rng() - src_name = meta.inputs[0] - style = "shift_small" if op in {"shls", "shrs"} else "bitwise" - src = _int_values(rng, meta.elem_counts[src_name], dtype, style=style) - buffers = _default_buffers(meta) - buffers[src_name] = src - _write_buffers(meta, buffers) - scalar = np.asarray(scalar, dtype=dtype).item() - if op == "ands": - out = np.bitwise_and(src, scalar) - elif op == "ors": - out = np.bitwise_or(src, scalar) - elif op == "xors": - out = np.bitwise_xor(src, scalar) - elif op == "shls": - out = np.left_shift(src, scalar) - elif op == "shrs": - out = np.right_shift(src, scalar) - 
else: - raise ValueError(f"unsupported scalar bitwise op: {op}") - _write_golden(meta, {_single_output(meta): np.asarray(out, dtype=dtype)}) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - golden_cmp = golden.astype(np.int64, copy=False) - output_cmp = output.astype(np.int64, copy=False) - else: - golden_cmp = golden.astype(np.float64, copy=False) - output_cmp = output.astype(np.float64, copy=False) - abs_diff = np.abs(golden_cmp - output_cmp) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={golden_cmp[idx]}, out={output_cmp[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, rows, cols): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = 
np.fromfile(output_path, dtype=np.uint8) - need = int(rows) * int(cols) - if golden.size < need or output.size < need: - print( - f"[ERROR] Packed mask buffer too small: need={need} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - golden = golden[:need].reshape(rows, cols) - output = output[:need].reshape(rows, cols) - row_bytes = min(_packed_row_bytes(cols), cols) - golden_sel = golden[:, :row_bytes].reshape(-1) - output_sel = output[:, :row_bytes].reshape(-1) - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" - ) - return False - return True - - -def compare_all_outputs(dtype, eps): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_bin(f"golden_{name}.bin", f"{name}.bin", dtype, eps) and ok - return finalize_compare(ok) - - -def compare_all_packed_mask_outputs(rows: int = ROWS, cols: int = COLS): - meta = load_case_meta() - ok = True - for name in meta.outputs: - ok = compare_packed_pred_mask(f"golden_{name}.bin", f"{name}.bin", rows, cols) and ok - return finalize_compare(ok) - + write_buffers(meta, buffers) + out = np.bitwise_xor(src, np.asarray(88, dtype=np.int16).item()) + write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.int16)}) -def finalize_compare(ok: bool): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return False - print("[INFO] compare passed") - return True -if __name__ == "__main__": - generate_bitwise_scalar_case("xors", 88, np.int16) +if __name__ == '__main__': + main() diff --git a/test/samples/validation_runtime.py 
b/test/samples/validation_runtime.py new file mode 100644 index 00000000..b7f4cd10 --- /dev/null +++ b/test/samples/validation_runtime.py @@ -0,0 +1,273 @@ +#!/usr/bin/python3 +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np + +SEED = 19 +ROWS = 32 +COLS = 32 + +_HOST_TYPE_TO_NP = { + "aclFloat16": np.float16, + "bfloat16_t": np.uint16, + "bool": np.bool_, + "double": np.float64, + "float": np.float32, + "half": np.float16, + "int": np.int32, + "int8_t": np.int8, + "int16_t": np.int16, + "int32_t": np.int32, + "int64_t": np.int64, + "size_t": np.uint64, + "uint8_t": np.uint8, + "uint16_t": np.uint16, + "uint32_t": np.uint32, + "uint64_t": np.uint64, + "unsigned": np.uint32, +} + + +@dataclass +class CaseMeta: + elem_counts: Dict[str, int] + np_types: Dict[str, np.dtype] + read_order: List[str] + outputs: List[str] + + @property + def inputs(self) -> List[str]: + return [name for name in self.read_order if name not in self.outputs] + + +def load_case_meta(main_cpp: str = 'main.cpp', outputs_txt: str = 'outputs.txt') -> CaseMeta: + text = Path(main_cpp).read_text(encoding='utf-8') + elem_counts = { + match.group(1): int(match.group(2)) + for match in re.finditer(r'size_t\s+elemCount_(\w+)\s*=\s*(\d+);', text) + } + np_types = { + match.group(1): np.dtype(_HOST_TYPE_TO_NP[match.group(2).strip()]) + for match in re.finditer( + r'size_t\s+fileSize_(\w+)\s*=\s*elemCount_\1\s*\*\s*sizeof\(([^)]+)\);', + text, + ) + } + read_order = re.findall(r'ReadFile\("\./([^"]+)\.bin"', text) + outputs_path = Path(outputs_txt) + outputs = [] + if outputs_path.is_file(): + outputs = [line.strip() for line in outputs_path.read_text(encoding='utf-8').splitlines() if line.strip()] + return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) + + +def rng(): + return np.random.default_rng(SEED) + + +def float_values(generator, count: int, *, style: str) 
-> np.ndarray: + if style == 'signed': + values = generator.uniform(-3.0, 3.0, size=count).astype(np.float32) + elif style == 'signed_small': + values = generator.uniform(-1.5, 1.5, size=count).astype(np.float32) + elif style == 'nonzero_signed': + values = generator.uniform(-3.0, 3.0, size=count).astype(np.float32) + mask = np.abs(values) < np.float32(0.25) + values[mask] = np.where(values[mask] >= 0.0, np.float32(0.25), np.float32(-0.25)) + elif style == 'positive': + values = generator.uniform(0.25, 4.0, size=count).astype(np.float32) + elif style in {'exp', 'cmp'}: + values = generator.uniform(-2.0, 2.0, size=count).astype(np.float32) + else: + raise ValueError(f'unsupported float style: {style}') + return values + + +def int_values(generator, count: int, dtype: np.dtype, *, style: str) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.int16): + if style != 'bitwise': + raise ValueError(f'unsupported int16 style: {style}') + values = generator.integers(-256, 256, size=count, dtype=np.int32) + elif dtype == np.dtype(np.int32): + if style == 'bitwise': + values = generator.integers(-256, 256, size=count, dtype=np.int32) + elif style == 'shift_small': + values = generator.integers(0, 4, size=count, dtype=np.int32) + else: + raise ValueError(f'unsupported int32 style: {style}') + else: + raise ValueError(f'unsupported dtype/style pair: {dtype}/{style}') + return values.astype(dtype, copy=False) + + +def matrix32(values: np.ndarray, rows: int = ROWS, cols: int = COLS) -> np.ndarray: + flat = np.asarray(values).reshape(-1) + expected = rows * cols + if flat.size != expected: + raise ValueError(f'expected {expected} elements, got {flat.size}') + return flat.reshape(rows, cols) + + +def default_buffers(meta: CaseMeta): + return {name: np.zeros(meta.elem_counts[name], dtype=meta.np_types[name]) for name in meta.read_order} + + +def write_buffers(meta: CaseMeta, buffers): + for name in meta.read_order: + if name not in buffers: + raise KeyError(f'missing 
buffer for {name}') + array = np.asarray(buffers[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if array.size != expected: + raise ValueError(f'{name}: expected {expected} elements, got {array.size}') + array.tofile(f'{name}.bin') + + +def write_golden(meta: CaseMeta, outputs): + for name in meta.outputs: + if name not in outputs: + raise KeyError(f'missing golden for {name}') + array = np.asarray(outputs[name], dtype=meta.np_types[name]).reshape(-1) + expected = meta.elem_counts[name] + if array.size != expected: + raise ValueError(f'{name}: expected {expected} golden elements, got {array.size}') + array.tofile(f'golden_{name}.bin') + + +def single_output(meta: CaseMeta) -> str: + if len(meta.outputs) != 1: + raise ValueError(f'expected exactly one output, got {meta.outputs}') + return meta.outputs[0] + + +def packed_row_bytes(cols: int) -> int: + return ((cols + 63) // 64) * 8 + + +def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: + bits = np.asarray(bits, dtype=np.bool_) + if bits.ndim != 2: + raise ValueError('mask bits must be a 2D array') + rows, cols = bits.shape + used_cols = packed_row_bytes(cols) + if storage_cols < used_cols: + raise ValueError(f'storage_cols={storage_cols} is too small for cols={cols}') + packed = np.zeros((rows, storage_cols), dtype=np.uint8) + for row in range(rows): + for word_index, base_col in enumerate(range(0, cols, 64)): + width = min(64, cols - base_col) + word = 0 + for bit_index in range(width): + if bits[row, base_col + bit_index]: + word |= 1 << bit_index + packed[row, word_index * 8:(word_index + 1) * 8] = np.frombuffer(word.to_bytes(8, 'little'), dtype=np.uint8) + return packed.reshape(-1) + + +def _report_compare_failure(golden: np.ndarray, output: np.ndarray, golden_path: str, output_path: str): + if golden.size == 0: + print(f'[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers') + return + if np.issubdtype(golden.dtype, np.integer) or 
np.issubdtype(golden.dtype, np.unsignedinteger): + golden_cmp = golden.astype(np.int64, copy=False) + output_cmp = output.astype(np.int64, copy=False) + else: + golden_cmp = golden.astype(np.float64, copy=False) + output_cmp = output.astype(np.float64, copy=False) + diff = np.abs(golden_cmp - output_cmp) + index = int(np.argmax(diff)) + print( + f'[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={float(diff[index])} at idx={index} ' + f'(golden={golden_cmp[index]}, out={output_cmp[index]}, dtype={golden.dtype})' + ) + + +def compare_file(golden_path: str, output_path: str, dtype, atol: float) -> bool: + if not os.path.exists(output_path): + print(f'[ERROR] Output missing: {output_path}') + return False + if not os.path.exists(golden_path): + print(f'[ERROR] Golden missing: {golden_path}') + return False + dtype = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype) + output = np.fromfile(output_path, dtype=dtype) + if golden.shape != output.shape: + print(f'[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}') + return False + if np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.unsignedinteger): + if atol == 0.0: + ok = np.array_equal(golden, output) + else: + ok = np.allclose(golden, output, atol=atol, rtol=atol) + else: + ok = np.allclose(golden, output, atol=atol, rtol=atol, equal_nan=True) + if not ok: + _report_compare_failure(golden, output, golden_path, output_path) + return False + return True + + +def compare_packed_mask_file(golden_path: str, output_path: str, *, rows: int = ROWS, cols: int = COLS) -> bool: + if not os.path.exists(output_path): + print(f'[ERROR] Output missing: {output_path}') + return False + if not os.path.exists(golden_path): + print(f'[ERROR] Golden missing: {golden_path}') + return False + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + if golden.size % rows != 0 or output.size % rows != 0: + print(f'[ERROR] 
Packed mask buffer size is not divisible by rows={rows}') + return False + golden_cols = golden.size // rows + output_cols = output.size // rows + used_cols = packed_row_bytes(cols) + if golden_cols < used_cols or output_cols < used_cols: + print(f'[ERROR] Packed mask storage is too small: need {used_cols} bytes per row') + return False + golden_view = golden.reshape(rows, golden_cols)[:, :used_cols].reshape(-1) + output_view = output.reshape(rows, output_cols)[:, :used_cols].reshape(-1) + if not np.array_equal(golden_view, output_view): + diff = np.nonzero(golden_view != output_view)[0] + index = int(diff[0]) if diff.size else 0 + print( + f'[ERROR] Packed mask mismatch: {golden_path} vs {output_path}, idx={index} ' + f'(golden={int(golden_view[index])}, out={int(output_view[index])})' + ) + return False + return True + + +def finalize_compare(ok: bool): + strict = os.getenv('COMPARE_STRICT', '1') != '0' + if not ok: + if strict: + print('[ERROR] compare failed') + sys.exit(2) + print('[WARN] compare failed (non-gating)') + return False + print('[INFO] compare passed') + return True + + +def compare_outputs(dtype, atol: float): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_file(f'golden_{name}.bin', f'{name}.bin', dtype, atol) and ok + return finalize_compare(ok) + + +def compare_packed_mask_outputs(*, rows: int = ROWS, cols: int = COLS): + meta = load_case_meta() + ok = True + for name in meta.outputs: + ok = compare_packed_mask_file(f'golden_{name}.bin', f'{name}.bin', rows=rows, cols=cols) and ok + return finalize_compare(ok) From 9643c7cee3078b5d3533df6ff7828fec59ec832f Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 12 Mar 2026 19:50:16 +0800 Subject: [PATCH 4/8] Skip validation helper scripts in runop --- test/samples/runop.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index ee50e931..d6345d5f 100755 --- a/test/samples/runop.sh +++ 
b/test/samples/runop.sh @@ -180,6 +180,11 @@ process_one_dir() { local f mlir ptobc_file decoded_pto cpp base overall=0 for f in "$dir"/*.py; do [[ -f "$f" ]] || continue + case "$(basename "$f")" in + *_golden.py|*_compare.py|validation_runtime.py) + continue + ;; + esac base="$(basename "$f" .py)" local expect_fail=0 case "$base" in From 85c26de9cf51c53ca45a33e9410e41e059ee5a93 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Thu, 12 Mar 2026 20:44:00 +0800 Subject: [PATCH 5/8] Fix validated sample golden semantics --- test/samples/Colsum/colsum_golden.py | 10 +++------- test/samples/Divs2/divs2_golden.py | 2 +- test/samples/Rem/rem_golden.py | 2 +- test/samples/Rems/rems_golden.py | 3 ++- test/samples/Rowexpand/rowexpand_golden.py | 8 +++++--- test/samples/Rowmax/rowmax_golden.py | 6 ++---- test/samples/Rowmin/rowmin_golden.py | 6 ++---- test/samples/Rowsum/rowsum_golden.py | 6 ++---- test/samples/Rsqrt/rsqrt_compare.py | 2 +- test/samples/validation_runtime.py | 12 ++++++------ 10 files changed, 25 insertions(+), 32 deletions(-) diff --git a/test/samples/Colsum/colsum_golden.py b/test/samples/Colsum/colsum_golden.py index 2408b9f8..b148730c 100755 --- a/test/samples/Colsum/colsum_golden.py +++ b/test/samples/Colsum/colsum_golden.py @@ -18,20 +18,16 @@ def main(): generator = rng() src = float_values(generator, meta.elem_counts[src_name], style='signed') src_m = matrix32(src) - out_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') buffers = default_buffers(meta) buffers[src_name] = src buffers[tmp_name] = np.zeros(meta.elem_counts[tmp_name], dtype=meta.np_types[tmp_name]) - buffers[out_name] = out_init write_buffers(meta, buffers) reduced = np.asarray(src_m.sum(axis=0, dtype=np.float32), dtype=np.float32) - out = np.asarray(out_init, dtype=np.float32).reshape(-1).copy() + out = np.zeros(meta.elem_counts[out_name], dtype=np.float32) if out.size == ROWS * COLS: - out_m = matrix32(out) - out_m[0, :] = reduced + 
out_m[0, :] - out = out_m.reshape(-1) + out[:COLS] = reduced elif out.size == COLS: - out = reduced + out + out = reduced else: raise ValueError(f'unsupported colsum output size: {out.size}') write_golden(meta, {out_name: out}) diff --git a/test/samples/Divs2/divs2_golden.py b/test/samples/Divs2/divs2_golden.py index 77846158..5fb49556 100755 --- a/test/samples/Divs2/divs2_golden.py +++ b/test/samples/Divs2/divs2_golden.py @@ -19,7 +19,7 @@ def main(): buffers = default_buffers(meta) buffers[src_name] = src write_buffers(meta, buffers) - out = np.float32(3.14) / src + out = src / np.float32(3.14) write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) diff --git a/test/samples/Rem/rem_golden.py b/test/samples/Rem/rem_golden.py index f3c77d2c..f567b585 100755 --- a/test/samples/Rem/rem_golden.py +++ b/test/samples/Rem/rem_golden.py @@ -21,7 +21,7 @@ def main(): buffers[lhs_name] = lhs buffers[rhs_name] = rhs write_buffers(meta, buffers) - out = np.fmod(lhs, rhs) + out = lhs - np.floor(lhs / rhs) * rhs write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) diff --git a/test/samples/Rems/rems_golden.py b/test/samples/Rems/rems_golden.py index 80e6f564..eb66d43c 100755 --- a/test/samples/Rems/rems_golden.py +++ b/test/samples/Rems/rems_golden.py @@ -19,7 +19,8 @@ def main(): buffers = default_buffers(meta) buffers[src_name] = src write_buffers(meta, buffers) - out = np.fmod(src, np.float32(3.14)) + scalar = np.float32(3.14) + out = src - np.floor(src / scalar) * scalar write_golden(meta, {single_output(meta): np.asarray(out, dtype=np.float32)}) diff --git a/test/samples/Rowexpand/rowexpand_golden.py b/test/samples/Rowexpand/rowexpand_golden.py index 8e53085f..2f839f67 100755 --- a/test/samples/Rowexpand/rowexpand_golden.py +++ b/test/samples/Rowexpand/rowexpand_golden.py @@ -8,7 +8,7 @@ sys.path.insert(0, str(search_root)) break -from validation_runtime import default_buffers, float_values, load_case_meta, matrix32, rng, 
single_output, write_buffers, write_golden +from validation_runtime import COLS, ROWS, default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden def main(): @@ -16,11 +16,13 @@ def main(): [src_name] = meta.inputs generator = rng() src = float_values(generator, meta.elem_counts[src_name], style='signed') - src_m = matrix32(src) buffers = default_buffers(meta) buffers[src_name] = src write_buffers(meta, buffers) - out = np.repeat(src_m[:, :1], 32, axis=1) + row_scalars = src[::COLS][:ROWS] + if row_scalars.size != ROWS: + raise ValueError(f'rowexpand: expected at least {ROWS} row scalars, got {row_scalars.size}') + out = np.repeat(row_scalars[:, None], COLS, axis=1) write_golden(meta, {single_output(meta): out.astype(np.float32).reshape(-1)}) diff --git a/test/samples/Rowmax/rowmax_golden.py b/test/samples/Rowmax/rowmax_golden.py index 74521a93..85f590c2 100755 --- a/test/samples/Rowmax/rowmax_golden.py +++ b/test/samples/Rowmax/rowmax_golden.py @@ -22,11 +22,9 @@ def main(): buffers[src_name] = src write_buffers(meta, buffers) reduced = np.asarray(src_m.max(axis=1), dtype=np.float32) - out = np.asarray(buffers.get(out_name, np.zeros(meta.elem_counts[out_name], dtype=np.float32)), dtype=np.float32).reshape(-1).copy() + out = np.zeros(meta.elem_counts[out_name], dtype=np.float32) if out.size == ROWS * COLS: - out_m = matrix32(out) - out_m[:, 0] = reduced - out = out_m.reshape(-1) + out[:ROWS] = reduced elif out.size == ROWS: out = reduced else: diff --git a/test/samples/Rowmin/rowmin_golden.py b/test/samples/Rowmin/rowmin_golden.py index f5295a56..f93d70ef 100755 --- a/test/samples/Rowmin/rowmin_golden.py +++ b/test/samples/Rowmin/rowmin_golden.py @@ -22,11 +22,9 @@ def main(): buffers[src_name] = src write_buffers(meta, buffers) reduced = np.asarray(src_m.min(axis=1), dtype=np.float32) - out = np.asarray(buffers.get(out_name, np.zeros(meta.elem_counts[out_name], dtype=np.float32)), dtype=np.float32).reshape(-1).copy() + out = 
np.zeros(meta.elem_counts[out_name], dtype=np.float32) if out.size == ROWS * COLS: - out_m = matrix32(out) - out_m[:, 0] = reduced - out = out_m.reshape(-1) + out[:ROWS] = reduced elif out.size == ROWS: out = reduced else: diff --git a/test/samples/Rowsum/rowsum_golden.py b/test/samples/Rowsum/rowsum_golden.py index aed41409..b975a71e 100755 --- a/test/samples/Rowsum/rowsum_golden.py +++ b/test/samples/Rowsum/rowsum_golden.py @@ -22,11 +22,9 @@ def main(): buffers[src_name] = src write_buffers(meta, buffers) reduced = np.asarray(src_m.sum(axis=1, dtype=np.float32), dtype=np.float32) - out = np.asarray(buffers.get(out_name, np.zeros(meta.elem_counts[out_name], dtype=np.float32)), dtype=np.float32).reshape(-1).copy() + out = np.zeros(meta.elem_counts[out_name], dtype=np.float32) if out.size == ROWS * COLS: - out_m = matrix32(out) - out_m[:, 0] = reduced - out = out_m.reshape(-1) + out[:ROWS] = reduced elif out.size == ROWS: out = reduced else: diff --git a/test/samples/Rsqrt/rsqrt_compare.py b/test/samples/Rsqrt/rsqrt_compare.py index 03205d0a..870671d3 100755 --- a/test/samples/Rsqrt/rsqrt_compare.py +++ b/test/samples/Rsqrt/rsqrt_compare.py @@ -11,4 +11,4 @@ from validation_runtime import compare_outputs if __name__ == '__main__': - compare_outputs(np.float32, atol=0.001) + compare_outputs(np.float32, atol=0.005) diff --git a/test/samples/validation_runtime.py b/test/samples/validation_runtime.py index b7f4cd10..94096352 100644 --- a/test/samples/validation_runtime.py +++ b/test/samples/validation_runtime.py @@ -147,7 +147,7 @@ def single_output(meta: CaseMeta) -> str: def packed_row_bytes(cols: int) -> int: - return ((cols + 63) // 64) * 8 + return (cols + 7) // 8 def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: @@ -160,13 +160,13 @@ def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: raise ValueError(f'storage_cols={storage_cols} is too small for cols={cols}') packed = np.zeros((rows, storage_cols), 
dtype=np.uint8) for row in range(rows): - for word_index, base_col in enumerate(range(0, cols, 64)): - width = min(64, cols - base_col) - word = 0 + for byte_index, base_col in enumerate(range(0, cols, 8)): + width = min(8, cols - base_col) + packed_byte = 0 for bit_index in range(width): if bits[row, base_col + bit_index]: - word |= 1 << bit_index - packed[row, word_index * 8:(word_index + 1) * 8] = np.frombuffer(word.to_bytes(8, 'little'), dtype=np.uint8) + packed_byte |= 1 << bit_index + packed[row, byte_index] = packed_byte return packed.reshape(-1) From de4d6d7fd6cae764733e0ff61acd53d44d7d936a Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Fri, 13 Mar 2026 09:46:12 +0800 Subject: [PATCH 6/8] Fix packed mask sample golden generation --- test/samples/Cmp/cmp_golden.py | 15 ++++++++++++--- test/samples/Cmps/cmps_golden.py | 15 ++++++++++++--- test/samples/Sel/sel_golden.py | 19 ++++++++++++++++--- test/samples/validation_runtime.py | 23 +++++++++++++++++++++++ 4 files changed, 63 insertions(+), 9 deletions(-) diff --git a/test/samples/Cmp/cmp_golden.py b/test/samples/Cmp/cmp_golden.py index d0c7dd3b..48639b6f 100755 --- a/test/samples/Cmp/cmp_golden.py +++ b/test/samples/Cmp/cmp_golden.py @@ -8,7 +8,17 @@ sys.path.insert(0, str(search_root)) break -from validation_runtime import ROWS, default_buffers, float_values, load_case_meta, matrix32, pack_predicate_mask, rng, single_output, write_buffers, write_golden +from validation_runtime import ( + default_buffers, + float_values, + load_case_meta, + matrix32, + pack_predicate_mask_for_buffer, + rng, + single_output, + write_buffers, + write_golden, +) def main(): @@ -23,8 +33,7 @@ def main(): buffers[src1_name] = src1 write_buffers(meta, buffers) out_name = single_output(meta) - storage_cols = meta.elem_counts[out_name] // ROWS - packed = pack_predicate_mask(pred, storage_cols=storage_cols) + packed = pack_predicate_mask_for_buffer(pred, elem_count=meta.elem_counts[out_name], 
dtype=meta.np_types[out_name]) write_golden(meta, {out_name: packed}) diff --git a/test/samples/Cmps/cmps_golden.py b/test/samples/Cmps/cmps_golden.py index 301f8bcc..c9172a52 100755 --- a/test/samples/Cmps/cmps_golden.py +++ b/test/samples/Cmps/cmps_golden.py @@ -8,7 +8,17 @@ sys.path.insert(0, str(search_root)) break -from validation_runtime import ROWS, default_buffers, float_values, load_case_meta, matrix32, pack_predicate_mask, rng, single_output, write_buffers, write_golden +from validation_runtime import ( + default_buffers, + float_values, + load_case_meta, + matrix32, + pack_predicate_mask_for_buffer, + rng, + single_output, + write_buffers, + write_golden, +) def main(): @@ -21,8 +31,7 @@ def main(): buffers[src_name] = src write_buffers(meta, buffers) out_name = single_output(meta) - storage_cols = meta.elem_counts[out_name] // 32 - packed = pack_predicate_mask(pred, storage_cols=storage_cols) + packed = pack_predicate_mask_for_buffer(pred, elem_count=meta.elem_counts[out_name], dtype=meta.np_types[out_name]) write_golden(meta, {out_name: packed}) diff --git a/test/samples/Sel/sel_golden.py b/test/samples/Sel/sel_golden.py index 880988bb..d561ac4a 100755 --- a/test/samples/Sel/sel_golden.py +++ b/test/samples/Sel/sel_golden.py @@ -8,16 +8,29 @@ sys.path.insert(0, str(search_root)) break -from validation_runtime import ROWS, COLS, default_buffers, float_values, load_case_meta, matrix32, pack_predicate_mask, rng, single_output, write_buffers, write_golden +from validation_runtime import ( + ROWS, + COLS, + default_buffers, + float_values, + load_case_meta, + matrix32, + pack_predicate_mask_for_buffer, + rng, + single_output, + write_buffers, + write_golden, +) def main(): meta = load_case_meta() mask_name, src0_name, src1_name = meta.inputs generator = rng() - storage_cols = meta.elem_counts[mask_name] // ROWS mask_bits = generator.integers(0, 2, size=(ROWS, COLS), dtype=np.uint8).astype(np.bool_) - mask = pack_predicate_mask(mask_bits, 
storage_cols=storage_cols) + mask = pack_predicate_mask_for_buffer( + mask_bits, elem_count=meta.elem_counts[mask_name], dtype=meta.np_types[mask_name] + ) src0 = float_values(generator, meta.elem_counts[src0_name], style='signed') src1 = float_values(generator, meta.elem_counts[src1_name], style='signed') buffers = default_buffers(meta) diff --git a/test/samples/validation_runtime.py b/test/samples/validation_runtime.py index 94096352..76befb58 100644 --- a/test/samples/validation_runtime.py +++ b/test/samples/validation_runtime.py @@ -150,6 +150,17 @@ def packed_row_bytes(cols: int) -> int: return (cols + 7) // 8 +def packed_mask_storage_bytes(elem_count: int, dtype) -> int: + return int(elem_count) * np.dtype(dtype).itemsize + + +def packed_mask_storage_cols(*, elem_count: int, dtype, rows: int = ROWS) -> int: + storage_bytes = packed_mask_storage_bytes(elem_count, dtype) + if storage_bytes % rows != 0: + raise ValueError(f'packed mask storage {storage_bytes} bytes is not divisible by rows={rows}') + return storage_bytes // rows + + def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: bits = np.asarray(bits, dtype=np.bool_) if bits.ndim != 2: @@ -170,6 +181,18 @@ def pack_predicate_mask(bits: np.ndarray, *, storage_cols: int) -> np.ndarray: return packed.reshape(-1) +def pack_predicate_mask_for_buffer(bits: np.ndarray, *, elem_count: int, dtype, rows: int = ROWS) -> np.ndarray: + dtype = np.dtype(dtype) + storage_cols = packed_mask_storage_cols(elem_count=elem_count, dtype=dtype, rows=rows) + packed_bytes = pack_predicate_mask(bits, storage_cols=storage_cols) + expected_bytes = packed_mask_storage_bytes(elem_count, dtype) + if packed_bytes.nbytes != expected_bytes: + raise ValueError( + f'packed mask byte size mismatch: expected {expected_bytes}, got {packed_bytes.nbytes}' + ) + return np.frombuffer(packed_bytes.tobytes(), dtype=dtype).copy() + + def _report_compare_failure(golden: np.ndarray, output: np.ndarray, golden_path: str, 
output_path: str): if golden.size == 0: print(f'[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers') From 6e26a8d639b88364a8f74ffb7d0f5be13fedad34 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Fri, 13 Mar 2026 10:36:12 +0800 Subject: [PATCH 7/8] Add shape-focused standalone sample goldens --- test/samples/AddPtr/addptr_chain_compare.py | 14 +++++++ test/samples/AddPtr/addptr_chain_golden.py | 31 +++++++++++++++ test/samples/AddPtr/addptr_compare.py | 14 +++++++ test/samples/AddPtr/addptr_f16_compare.py | 14 +++++++ test/samples/AddPtr/addptr_f16_golden.py | 31 +++++++++++++++ test/samples/AddPtr/addptr_golden.py | 31 +++++++++++++++ test/samples/Ci/ci_compare.py | 14 +++++++ test/samples/Ci/ci_golden.py | 27 +++++++++++++ test/samples/Fillpad/fillpad_compare.py | 14 +++++++ .../samples/Fillpad/fillpad_expand_compare.py | 14 +++++++ test/samples/Fillpad/fillpad_expand_golden.py | 34 ++++++++++++++++ test/samples/Fillpad/fillpad_golden.py | 29 ++++++++++++++ .../tensor_view_infer_layout_dn_compare.py | 14 +++++++ .../tensor_view_infer_layout_dn_golden.py | 29 ++++++++++++++ .../Layout/tensor_view_layout_dn_compare.py | 14 +++++++ .../Layout/tensor_view_layout_dn_golden.py | 29 ++++++++++++++ .../Partition5D/partition5d_a5_compare.py | 14 +++++++ .../Partition5D/partition5d_a5_golden.py | 39 +++++++++++++++++++ .../Partition5D/partition5d_compare.py | 14 +++++++ .../samples/Partition5D/partition5d_golden.py | 39 +++++++++++++++++++ .../Sync/test_dynamic_valid_shape_compare.py | 14 +++++++ .../Sync/test_dynamic_valid_shape_golden.py | 29 ++++++++++++++ test/samples/validation_runtime.py | 10 +++++ 23 files changed, 512 insertions(+) create mode 100644 test/samples/AddPtr/addptr_chain_compare.py create mode 100644 test/samples/AddPtr/addptr_chain_golden.py create mode 100644 test/samples/AddPtr/addptr_compare.py create mode 100644 test/samples/AddPtr/addptr_f16_compare.py create mode 100644 test/samples/AddPtr/addptr_f16_golden.py create 
mode 100644 test/samples/AddPtr/addptr_golden.py create mode 100644 test/samples/Ci/ci_compare.py create mode 100644 test/samples/Ci/ci_golden.py create mode 100644 test/samples/Fillpad/fillpad_compare.py create mode 100644 test/samples/Fillpad/fillpad_expand_compare.py create mode 100644 test/samples/Fillpad/fillpad_expand_golden.py create mode 100644 test/samples/Fillpad/fillpad_golden.py create mode 100644 test/samples/Layout/tensor_view_infer_layout_dn_compare.py create mode 100644 test/samples/Layout/tensor_view_infer_layout_dn_golden.py create mode 100644 test/samples/Layout/tensor_view_layout_dn_compare.py create mode 100644 test/samples/Layout/tensor_view_layout_dn_golden.py create mode 100644 test/samples/Partition5D/partition5d_a5_compare.py create mode 100644 test/samples/Partition5D/partition5d_a5_golden.py create mode 100644 test/samples/Partition5D/partition5d_compare.py create mode 100644 test/samples/Partition5D/partition5d_golden.py create mode 100644 test/samples/Sync/test_dynamic_valid_shape_compare.py create mode 100644 test/samples/Sync/test_dynamic_valid_shape_golden.py diff --git a/test/samples/AddPtr/addptr_chain_compare.py b/test/samples/AddPtr/addptr_chain_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ b/test/samples/AddPtr/addptr_chain_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/AddPtr/addptr_chain_golden.py b/test/samples/AddPtr/addptr_chain_golden.py new file mode 100644 index 00000000..49f5ea2f --- /dev/null +++ b/test/samples/AddPtr/addptr_chain_golden.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +import numpy as 
np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + offset = 24 + out = src[offset:offset + meta.elem_counts[out_name]] + write_golden(meta, {out_name: np.asarray(out, dtype=np.float32)}) + + +if __name__ == '__main__': + main() diff --git a/test/samples/AddPtr/addptr_compare.py b/test/samples/AddPtr/addptr_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ b/test/samples/AddPtr/addptr_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/AddPtr/addptr_f16_compare.py b/test/samples/AddPtr/addptr_f16_compare.py new file mode 100644 index 00000000..0c36d972 --- /dev/null +++ b/test/samples/AddPtr/addptr_f16_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 
'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float16, atol=0.0) diff --git a/test/samples/AddPtr/addptr_f16_golden.py b/test/samples/AddPtr/addptr_f16_golden.py new file mode 100644 index 00000000..0e2b25ed --- /dev/null +++ b/test/samples/AddPtr/addptr_f16_golden.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed').astype(np.float16) + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small').astype(np.float16) + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + offset = 32 + out = src[offset:offset + meta.elem_counts[out_name]] + write_golden(meta, {out_name: np.asarray(out, dtype=np.float16)}) + + +if __name__ == '__main__': + main() diff --git a/test/samples/AddPtr/addptr_golden.py b/test/samples/AddPtr/addptr_golden.py new file mode 100644 index 00000000..b4d640d3 --- /dev/null +++ b/test/samples/AddPtr/addptr_golden.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, 
float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + offset = 32 + out = src[offset:offset + meta.elem_counts[out_name]] + write_golden(meta, {out_name: np.asarray(out, dtype=np.float32)}) + + +if __name__ == '__main__': + main() diff --git a/test/samples/Ci/ci_compare.py b/test/samples/Ci/ci_compare.py new file mode 100644 index 00000000..b93e8b9e --- /dev/null +++ b/test/samples/Ci/ci_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.int32, atol=0.0) diff --git a/test/samples/Ci/ci_golden.py b/test/samples/Ci/ci_golden.py new file mode 100644 index 00000000..93976ed3 --- /dev/null +++ b/test/samples/Ci/ci_golden.py @@ -0,0 +1,27 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, load_case_meta, load_int32_assignments, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + out_name = single_output(meta) + start, = load_int32_assignments() + buffers = default_buffers(meta) + 
buffers[out_name] = np.full(meta.elem_counts[out_name], -123, dtype=np.int32) + write_buffers(meta, buffers) + cols = meta.elem_counts[out_name] + out = np.asarray([start - index for index in range(cols)], dtype=np.int32) + write_golden(meta, {out_name: out}) + + +if __name__ == '__main__': + main() diff --git a/test/samples/Fillpad/fillpad_compare.py b/test/samples/Fillpad/fillpad_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ b/test/samples/Fillpad/fillpad_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/Fillpad/fillpad_expand_compare.py b/test/samples/Fillpad/fillpad_expand_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ b/test/samples/Fillpad/fillpad_expand_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/Fillpad/fillpad_expand_golden.py b/test/samples/Fillpad/fillpad_expand_golden.py new file mode 100644 index 00000000..4a407b69 --- /dev/null +++ b/test/samples/Fillpad/fillpad_expand_golden.py @@ -0,0 +1,34 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 
'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + src_rows = 32 + src_cols = meta.elem_counts[src_name] // src_rows + dst_cols = meta.elem_counts[out_name] // src_rows + out = np.zeros((src_rows, dst_cols), dtype=np.float32) + out[:, :src_cols] = np.asarray(src, dtype=np.float32).reshape(src_rows, src_cols) + write_golden(meta, {out_name: out.reshape(-1)}) + + +if __name__ == '__main__': + main() diff --git a/test/samples/Fillpad/fillpad_golden.py b/test/samples/Fillpad/fillpad_golden.py new file mode 100644 index 00000000..1484e1e2 --- /dev/null +++ b/test/samples/Fillpad/fillpad_golden.py @@ -0,0 +1,29 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, 
buffers) + write_golden(meta, {out_name: np.asarray(src, dtype=np.float32)}) + + +if __name__ == '__main__': + main() diff --git a/test/samples/Layout/tensor_view_infer_layout_dn_compare.py b/test/samples/Layout/tensor_view_infer_layout_dn_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ b/test/samples/Layout/tensor_view_infer_layout_dn_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/Layout/tensor_view_infer_layout_dn_golden.py b/test/samples/Layout/tensor_view_infer_layout_dn_golden.py new file mode 100644 index 00000000..1484e1e2 --- /dev/null +++ b/test/samples/Layout/tensor_view_infer_layout_dn_golden.py @@ -0,0 +1,29 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + write_golden(meta, {out_name: np.asarray(src, dtype=np.float32)}) + + +if __name__ == '__main__': + main() diff --git 
a/test/samples/Layout/tensor_view_layout_dn_compare.py b/test/samples/Layout/tensor_view_layout_dn_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ b/test/samples/Layout/tensor_view_layout_dn_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/Layout/tensor_view_layout_dn_golden.py b/test/samples/Layout/tensor_view_layout_dn_golden.py new file mode 100644 index 00000000..1484e1e2 --- /dev/null +++ b/test/samples/Layout/tensor_view_layout_dn_golden.py @@ -0,0 +1,29 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + write_golden(meta, {out_name: np.asarray(src, dtype=np.float32)}) + + +if __name__ == '__main__': + main() diff --git a/test/samples/Partition5D/partition5d_a5_compare.py b/test/samples/Partition5D/partition5d_a5_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ 
b/test/samples/Partition5D/partition5d_a5_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/Partition5D/partition5d_a5_golden.py b/test/samples/Partition5D/partition5d_a5_golden.py new file mode 100644 index 00000000..1ecb70fa --- /dev/null +++ b/test/samples/Partition5D/partition5d_a5_golden.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + + src_array = np.asarray(src, dtype=np.float32) + out_array = np.asarray(dst_init, dtype=np.float32).copy() + touched = [] + for depth in range(16): + for row in range(16): + base = depth * 1048576 + row * 1024 + touched.extend(base + col for col in range(16)) + touched = np.asarray(touched, dtype=np.int64) + out_array[touched] = src_array[touched] + write_golden(meta, {out_name: out_array}) + + +if __name__ == '__main__': + main() diff --git 
a/test/samples/Partition5D/partition5d_compare.py b/test/samples/Partition5D/partition5d_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ b/test/samples/Partition5D/partition5d_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/Partition5D/partition5d_golden.py b/test/samples/Partition5D/partition5d_golden.py new file mode 100644 index 00000000..1ecb70fa --- /dev/null +++ b/test/samples/Partition5D/partition5d_golden.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + + src_array = np.asarray(src, dtype=np.float32) + out_array = np.asarray(dst_init, dtype=np.float32).copy() + touched = [] + for depth in range(16): + for row in range(16): + base = depth * 1048576 + row * 1024 + touched.extend(base + col for col in range(16)) + touched = np.asarray(touched, dtype=np.int64) + 
out_array[touched] = src_array[touched] + write_golden(meta, {out_name: out_array}) + + +if __name__ == '__main__': + main() diff --git a/test/samples/Sync/test_dynamic_valid_shape_compare.py b/test/samples/Sync/test_dynamic_valid_shape_compare.py new file mode 100644 index 00000000..6764bd08 --- /dev/null +++ b/test/samples/Sync/test_dynamic_valid_shape_compare.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import compare_outputs + +if __name__ == '__main__': + compare_outputs(np.float32, atol=0.0) diff --git a/test/samples/Sync/test_dynamic_valid_shape_golden.py b/test/samples/Sync/test_dynamic_valid_shape_golden.py new file mode 100644 index 00000000..82e8ff15 --- /dev/null +++ b/test/samples/Sync/test_dynamic_valid_shape_golden.py @@ -0,0 +1,29 @@ +#!/usr/bin/python3 +import numpy as np +from pathlib import Path +import sys + +for search_root in (Path(__file__).resolve().parent, Path(__file__).resolve().parents[1]): + if (search_root / 'validation_runtime.py').is_file(): + sys.path.insert(0, str(search_root)) + break + +from validation_runtime import default_buffers, float_values, load_case_meta, rng, single_output, write_buffers, write_golden + + +def main(): + meta = load_case_meta() + src_name = meta.inputs[0] + out_name = single_output(meta) + generator = rng() + src = float_values(generator, meta.elem_counts[src_name], style='signed') + dst_init = float_values(generator, meta.elem_counts[out_name], style='signed_small') + buffers = default_buffers(meta) + buffers[src_name] = src + buffers[out_name] = dst_init + write_buffers(meta, buffers) + write_golden(meta, {out_name: np.asarray(dst_init, dtype=np.float32)}) + + +if __name__ == '__main__': + main() diff --git 
a/test/samples/validation_runtime.py b/test/samples/validation_runtime.py index 76befb58..8b02b843 100644 --- a/test/samples/validation_runtime.py +++ b/test/samples/validation_runtime.py @@ -66,6 +66,16 @@ def load_case_meta(main_cpp: str = 'main.cpp', outputs_txt: str = 'outputs.txt') return CaseMeta(elem_counts=elem_counts, np_types=np_types, read_order=read_order, outputs=outputs) +def load_scalar_assignments(ctype: str, main_cpp: str = 'main.cpp') -> List[int]: + text = Path(main_cpp).read_text(encoding='utf-8') + pattern = rf'{re.escape(ctype)}\s+\w+\s*=\s*(-?\d+);' + return [int(value) for value in re.findall(pattern, text)] + + +def load_int32_assignments(main_cpp: str = 'main.cpp') -> List[int]: + return load_scalar_assignments('int32_t', main_cpp=main_cpp) + + def rng(): return np.random.default_rng(SEED) From ea52e7d460c13dce0d502a85a3d0ae0da224c733 Mon Sep 17 00:00:00 2001 From: HecreReed <821896444@qq.com> Date: Fri, 13 Mar 2026 12:00:43 +0800 Subject: [PATCH 8/8] Fix A3 remote validation regressions --- .github/workflows/ci.yml | 2 +- lib/PTO/IR/PTO.cpp | 60 +++- lib/PTO/Transforms/PTOToEmitC.cpp | 271 ++++++++++++++++-- test/samples/Prelu/prelu.py | 4 +- test/samples/Rowexpanddiv/rowexpanddiv.py | 12 +- .../Rowexpanddiv/rowexpanddiv_golden.py | 8 +- test/samples/Rowexpandmul/rowexpandmul.py | 10 +- .../Rowexpandmul/rowexpandmul_golden.py | 8 +- test/samples/Rowexpandsub/rowexpandsub.py | 12 +- .../Rowexpandsub/rowexpandsub_golden.py | 8 +- 10 files changed, 342 insertions(+), 53 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5074c19..afc92bac 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -279,7 +279,7 @@ jobs: # suite (RUN_ONLY_CASES is empty), skip the non-matching variant based # on SOC_VERSION to keep the remote validation portable. 
A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync" - A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5" + A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,plan_memory_scopes_independent" sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')" is_a5=0 diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 02670518..5ac57919 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -2566,7 +2566,8 @@ mlir::LogicalResult mlir::pto::TPReluOp::verify() { Type e0 = getElemTy(t0), e1 = getElemTy(t1), et = getElemTy(tt), ed = getElemTy(td); if (!e0 || !e1 || !et || !ed) return emitOpError("failed to get element type for operands"); - // TPRELU C++ API (TPreluCheck): dst/src0/src1 same type (half or float); tmp must be uint8_t. + // TPRELU C++ API (TPreluCheck): dst/src0/src1 same type (half or float); + // tmp must be uint8_t. if (e0 != e1 || e0 != ed) return emitOpError("expects src0/src1/dst to have the same element type (f16 or f32)"); if (!e0.isa() || (!e0.isF16() && !e0.isF32())) @@ -2793,6 +2794,42 @@ mlir::LogicalResult mlir::pto::TRowExpandOp::verify() { return emitOpError("expects src/dst to have the same element type"); return mlir::success(); } + +static mlir::LogicalResult verifyTRowExpandLike(Operation *op, Type src0Ty, + Type src1Ty, Type dstTy, + Type elemTy) { + auto s0 = getShapeVec(src0Ty), s1 = getShapeVec(src1Ty), sd = getShapeVec(dstTy); + if (s0.size() != 2 || s1.size() != 2 || sd.size() != 2) + return op->emitError("expects rank-2 shaped operands"); + auto elemBytes = getElemBytes(elemTy); + if (!elemBytes || *elemBytes <= 0) + return op->emitError("unsupported element type"); + + int64_t blockCols = 32 / *elemBytes; + auto isPerRowShape = [&](ArrayRef shape) { + return shape[0] == sd[0] && (shape[1] == 1 || shape[1] == blockCols); + }; + + bool src0EqDst = s0 == sd; + bool src1EqDst = s1 == sd; + if (src0EqDst == src1EqDst) + return 
op->emitError( + "expects exactly one source to match dst shape and the other to provide one scalar per row"); + + if (src0EqDst) { + if (!isPerRowShape(s1)) + return op->emitError() << "expects src1 shape to be [" << sd[0] + << " x 1] or [" << sd[0] << " x " << blockCols + << "] when src0 matches dst"; + return success(); + } + + if (!isPerRowShape(s0)) + return op->emitError() << "expects src0 shape to be [" << sd[0] + << " x 1] or [" << sd[0] << " x " << blockCols + << "] when src1 matches dst"; + return success(); +} //===----------------------------------------------------------------------===// // PTO.cpp (add verifier for TROWEXPANDDIV DPS/tilebuf op) //===----------------------------------------------------------------------===// @@ -2811,7 +2848,7 @@ mlir::LogicalResult mlir::pto::TRowExpandDivOp::verify() { auto elemTy = e0.dyn_cast(); if (!elemTy || (!elemTy.isF16() && !elemTy.isF32())) return emitOpError("expects element type to be f16 or f32"); - return mlir::success(); + return verifyTRowExpandLike(getOperation(), t0, t1, td, e0); } //===----------------------------------------------------------------------===// // PTO.cpp (add verifier for TROWEXPANDMUL DPS/tilebuf op) @@ -2831,7 +2868,7 @@ mlir::LogicalResult mlir::pto::TRowExpandMulOp::verify() { auto ft = e0.dyn_cast(); if (!ft || (!ft.isF16() && !ft.isF32())) return emitOpError("expects element type to be f16 or f32"); - return mlir::success(); + return verifyTRowExpandLike(getOperation(), t0, t1, td, e0); } //===----------------------------------------------------------------------===// // PTO.cpp (add verifier for TROWEXPANDSUB DPS/tilebuf op) @@ -2851,7 +2888,7 @@ mlir::LogicalResult mlir::pto::TRowExpandSubOp::verify() { auto ft = e0.dyn_cast(); if (!ft || (!ft.isF16() && !ft.isF32())) return emitOpError("expects element type to be f16 or f32"); - return mlir::success(); + return verifyTRowExpandLike(getOperation(), t0, t1, td, e0); } 
//===----------------------------------------------------------------------===// // PTO.cpp (add verifier for TROWMAX DPS/tilebuf op) @@ -3013,19 +3050,22 @@ mlir::LogicalResult mlir::pto::TSelSOp::verify() { Type td = getDst().getType(); if (!isPTOShapedLike(t0) || !isPTOShapedLike(t1) || !isPTOShapedLike(td)) return emitOpError("expects src0/src1/dst to be memref/tensor/tile_buf/tile_view types"); - Type es = getElemTy(t0), ed = getElemTy(td); - if (!es || !ed) + Type e0 = getElemTy(t0), e1 = getElemTy(t1), ed = getElemTy(td); + if (!e0 || !e1 || !ed) return emitOpError("failed to get element type for operands"); - if (es != ed) - return emitOpError("expects src0 and dst to have the same element type"); + if (e0 != e1 || e0 != ed) + return emitOpError("expects src0/src1/dst to have the same element type"); + auto s0 = getShapeVec(t0), s1 = getShapeVec(t1), sd = getShapeVec(td); + if (s0 != s1 || s0 != sd) + return emitOpError("expects src0/src1/dst to have the same shape"); auto isAllowedElem = [&](mlir::Type t) -> bool { if (t.isF16() || t.isF32() || t.isBF16()) return true; if (auto it = mlir::dyn_cast(t)) return (it.getWidth() == 8 || it.getWidth() == 16 || it.getWidth() == 32); return false; }; - if (!isAllowedElem(es)) - return emitOpError("expects src0 and dst element type to be i8/i16/i32/f16/bf16/f32"); + if (!isAllowedElem(e0)) + return emitOpError("expects src0/src1/dst element type to be i8/i16/i32/f16/bf16/f32"); return mlir::success(); } //===----------------------------------------------------------------------===// diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index 939287dd..943ff0b1 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -1686,6 +1686,193 @@ static Value makeEmitCIntConstant(ConversionPatternRewriter &rewriter, return makeEmitCOpaqueConstant(rewriter, loc, type, std::to_string(value)); } +static bool isEmitCTileOpaqueType(Type type) { + auto opaqueType = 
dyn_cast(type); + if (!opaqueType) + return false; + StringRef typeName = opaqueType.getValue(); + return typeName.starts_with("Tile<") && typeName.ends_with(">"); +} + +static LogicalResult parseEmitCTileType(Value exemplar, + SmallVectorImpl &parts) { + auto tileTy = dyn_cast(exemplar.getType()); + if (!tileTy) + return failure(); + + StringRef tileTypeStr = tileTy.getValue(); + if (!tileTypeStr.starts_with("Tile<") || !tileTypeStr.ends_with(">")) + return failure(); + + StringRef body = tileTypeStr.drop_front(5).drop_back(1); + size_t partBegin = 0; + int angleDepth = 0; + for (size_t i = 0; i < body.size(); ++i) { + char c = body[i]; + if (c == '<') { + ++angleDepth; + } else if (c == '>') { + if (angleDepth > 0) + --angleDepth; + } else if (c == ',' && angleDepth == 0) { + parts.push_back(body.slice(partBegin, i).trim().str()); + partBegin = i + 1; + } + } + parts.push_back(body.drop_front(partBegin).trim().str()); + return success(parts.size() >= 10); +} + +static FailureOr getEmitCTileStorageSizeBytes(Value exemplar) { + SmallVector parts; + if (failed(parseEmitCTileType(exemplar, parts))) + return failure(); + + int64_t rows = 0; + int64_t cols = 0; + if (StringRef(parts[2]).trim().getAsInteger(10, rows) || + StringRef(parts[3]).trim().getAsInteger(10, cols)) + return failure(); + + int64_t elemBytes = 0; + StringRef elemTok = StringRef(parts[1]).trim(); + if (elemTok == "half" || elemTok == "float16_t" || elemTok == "bfloat16_t" || + elemTok == "int16_t" || elemTok == "uint16_t") { + elemBytes = 2; + } else if (elemTok == "float" || elemTok == "int32_t" || + elemTok == "uint32_t") { + elemBytes = 4; + } else if (elemTok == "double" || elemTok == "int64_t" || + elemTok == "uint64_t") { + elemBytes = 8; + } else if (elemTok == "int8_t" || elemTok == "uint8_t" || + elemTok == "bool") { + elemBytes = 1; + } else { + return failure(); + } + + return rows * cols * elemBytes; +} + +static FailureOr +buildIntegralAddressFromTileLike(Location loc, Value 
sourceValue, + ConversionPatternRewriter &rewriter) { + auto *ctx = rewriter.getContext(); + auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); + auto rcU64 = + rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "uint64_t")}); + + if (auto castOp = sourceValue.getDefiningOp()) + sourceValue = castOp.getOperand(); + + Value rawPtr = sourceValue; + if (isEmitCTileOpaqueType(sourceValue.getType())) { + SmallVector parts; + if (failed(parseEmitCTileType(sourceValue, parts))) + return failure(); + + StringRef roleTok = StringRef(parts[0]).trim(); + StringRef elemTok = StringRef(parts[1]).trim(); + std::string qualifier; + if (roleTok == "TileType::Vec") { + qualifier = "__ubuf__"; + } else if (roleTok == "TileType::Mat") { + qualifier = "__cbuf__"; + } else if (roleTok == "TileType::Left") { + qualifier = "__ca__"; + } else if (roleTok == "TileType::Right") { + qualifier = "__cb__"; + } else if (roleTok == "TileType::Acc") { + qualifier = "__cc__"; + } else if (roleTok == "TileType::Scaling") { + qualifier = "__fbuf__"; + } else { + qualifier = "__gm__"; + } + + auto rawPtrTy = + emitc::OpaqueType::get(ctx, qualifier + " " + elemTok.str() + "*"); + rawPtr = rewriter + .create(loc, rawPtrTy, "PTOAS__TILE_DATA", + ArrayAttr{}, ArrayAttr{}, + ValueRange{sourceValue}) + .getResult(0); + } + + if (isa(rawPtr.getType()) || + (isa(rawPtr.getType()) && + cast(rawPtr.getType()).getValue().ends_with("*"))) { + return rewriter + .create(loc, u64Ty, "reinterpret_cast", + ArrayAttr{}, rcU64, ValueRange{rawPtr}) + .getResult(0); + } + + if (rawPtr.getType() == u64Ty) + return rawPtr; + return rewriter.create(loc, u64Ty, rawPtr).getResult(); +} + +static FailureOr +createSiblingTileValue(Location loc, Value exemplar, + ConversionPatternRewriter &rewriter) { + auto opaqueTy = dyn_cast(exemplar.getType()); + if (!opaqueTy || !isEmitCTileOpaqueType(opaqueTy)) + return failure(); + + SmallVector parts; + if (failed(parseEmitCTileType(exemplar, parts))) + return failure(); + if 
(StringRef(parts[5]).trim() == "-1" || StringRef(parts[6]).trim() == "-1") + return failure(); + + return rewriter + .create(loc, opaqueTy, + emitc::OpaqueAttr::get(rewriter.getContext(), "")) + .getResult(); +} + +static FailureOr +materializeDisjointTempTile(Location loc, Value exemplar, + ArrayRef occupiedValues, + ConversionPatternRewriter &rewriter) { + if (occupiedValues.empty()) + return failure(); + + auto tmpTile = createSiblingTileValue(loc, exemplar, rewriter); + auto tileBytes = getEmitCTileStorageSizeBytes(exemplar); + if (failed(tmpTile) || failed(tileBytes)) + return failure(); + + auto *ctx = rewriter.getContext(); + auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); + SmallVector addrs; + addrs.reserve(occupiedValues.size()); + for (Value value : occupiedValues) { + auto addr = buildIntegralAddressFromTileLike(loc, value, rewriter); + if (failed(addr)) + return failure(); + addrs.push_back(*addr); + } + + Value maxAddr = addrs.front(); + for (Value addr : ArrayRef(addrs).drop_front()) { + Value isLess = rewriter.create( + loc, rewriter.getI1Type(), emitc::CmpPredicate::lt, maxAddr, addr); + maxAddr = rewriter.create(loc, u64Ty, isLess, addr, + maxAddr); + } + + Value tmpAddr = rewriter.create( + loc, u64Ty, maxAddr, + makeEmitCIntConstant(rewriter, loc, u64Ty, *tileBytes)); + rewriter.create(loc, TypeRange{}, "TASSIGN", ArrayAttr{}, + ArrayAttr{}, + ValueRange{*tmpTile, tmpAddr}); + return *tmpTile; +} + static Value emitCCast(ConversionPatternRewriter &rewriter, Location loc, Type dstType, Value src) { if (src.getType() == dstType) @@ -5420,18 +5607,44 @@ struct PTOPreluToEmitC : public OpConversionPattern { LogicalResult matchAndRewrite(pto::TPReluOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); + auto *ctx = rewriter.getContext(); Value src0 = peelUnrealized(adaptor.getSrc0()); Value src1 = peelUnrealized(adaptor.getSrc1()); - Value tmp = peelUnrealized(adaptor.getTmp()); Value dst = 
peelUnrealized(adaptor.getDst()); - - // C++ interface: TPRELU(dst, src0, src1, tmp) — last parameter is tmp. - SmallVector operands{dst, src0, src1, tmp}; - rewriter.create( - loc, TypeRange{}, "TPRELU", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); + auto tmp = materializeDisjointTempTile(loc, dst, {src0, src1, dst}, rewriter); + if (failed(tmp)) + return op.emitOpError("failed to materialize disjoint temp tile"); + + SmallVector parts; + if (failed(parseEmitCTileType(dst, parts))) + return op.emitOpError("failed to infer element type for tprelu lowering"); + + auto scalarTy = + emitc::OpaqueType::get(ctx, StringRef(parts[1]).trim().str()); + auto zero = rewriter.create( + loc, scalarTy, emitc::OpaqueAttr::get(ctx, "0")); + auto pipeVArgs = + rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "PIPE_V")}); + + rewriter.create(loc, TypeRange{}, "TMINS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{*tmp, src0, zero}); + rewriter.create(loc, TypeRange{}, "pipe_barrier", + pipeVArgs, ArrayAttr{}, ValueRange{}); + rewriter.create(loc, TypeRange{}, "TMUL", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, *tmp, src1}); + rewriter.create(loc, TypeRange{}, "pipe_barrier", + pipeVArgs, ArrayAttr{}, ValueRange{}); + rewriter.create(loc, TypeRange{}, "TMAXS", + ArrayAttr{}, ArrayAttr{}, + ValueRange{*tmp, src0, zero}); + rewriter.create(loc, TypeRange{}, "pipe_barrier", + pipeVArgs, ArrayAttr{}, ValueRange{}); + rewriter.create(loc, TypeRange{}, "TADD", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, dst, *tmp}); rewriter.eraseOp(op); return success(); @@ -5910,11 +6123,25 @@ struct PTOSelSToEmitC : public OpConversionPattern { Value selectMode = peelUnrealized(adaptor.getSelectMode()); Value dst = peelUnrealized(adaptor.getDst()); - SmallVector operands{dst, src0, src1, selectMode}; - rewriter.create( - loc, TypeRange{}, "TSELS", - /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, - /*operands=*/operands); + auto src0Addr = 
buildIntegralAddressFromTileLike(loc, src0, rewriter); + auto src1Addr = buildIntegralAddressFromTileLike(loc, src1, rewriter); + if (failed(src0Addr) || failed(src1Addr)) + return rewriter.notifyMatchFailure(op, + "failed to materialize selected tile address"); + + Value selectOne = + makeEmitCIntConstant(rewriter, loc, selectMode.getType(), 1); + Value chooseSrc0 = rewriter.create( + loc, rewriter.getI1Type(), emitc::CmpPredicate::eq, selectMode, + selectOne); + auto *ctx = rewriter.getContext(); + auto u64Ty = emitc::OpaqueType::get(ctx, "uint64_t"); + Value selectedAddr = rewriter.create( + loc, u64Ty, chooseSrc0, *src0Addr, *src1Addr); + + rewriter.create(loc, TypeRange{}, "TASSIGN", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dst, selectedAddr}); rewriter.eraseOp(op); return success(); @@ -6218,9 +6445,12 @@ struct PTOXORToEmitC : public OpConversionPattern { Value src1 = peelUnrealized(adaptor.getSrc1()); Value dst = peelUnrealized(adaptor.getDst()); - // pto-isa TXOR requires a tmp tile argument. Current NPU implementation - // does not use tmp, so we safely pass dst as tmp for compatibility. - SmallVector operands{dst, src0, src1, dst}; + auto tmp = materializeDisjointTempTile(loc, dst, {src0, src1, dst}, rewriter); + if (failed(tmp)) + return rewriter.notifyMatchFailure(op, + "failed to materialize disjoint tmp tile for TXOR"); + + SmallVector operands{dst, src0, src1, *tmp}; rewriter.create( loc, TypeRange{}, "TXOR", /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, @@ -6266,9 +6496,12 @@ struct PTOXORSToEmitC : public OpConversionPattern { Value scalar = peelUnrealized(adaptor.getScalar()); Value dst = peelUnrealized(adaptor.getDst()); - // pto-isa TXORS requires a tmp tile argument. Current NPU implementation - // does not use tmp, so we safely pass dst as tmp for compatibility. 
- SmallVector operands{dst, src, scalar, dst}; + auto tmp = materializeDisjointTempTile(loc, dst, {src, dst}, rewriter); + if (failed(tmp)) + return rewriter.notifyMatchFailure(op, + "failed to materialize disjoint tmp tile for TXORS"); + + SmallVector operands{dst, src, scalar, *tmp}; rewriter.create( loc, TypeRange{}, "TXORS", /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, diff --git a/test/samples/Prelu/prelu.py b/test/samples/Prelu/prelu.py index 312f0432..f41a7ead 100644 --- a/test/samples/Prelu/prelu.py +++ b/test/samples/Prelu/prelu.py @@ -23,7 +23,6 @@ def build(): fractal_ab_size = pto.TileConfig.fractalABSize cfg = pto.TileBufConfigAttr.get(bl, sl, fractal_ab_size, pd, ctx) tile_buf_32 = pto.TileBufType.get([32, 32], f32, vec, [32, 32], cfg, ctx) - # pto.tprelu verifier: tmp must be uint8 (TPreluCheck in pto-isa). u8 = IntegerType.get_unsigned(8, ctx) tile_buf_u8 = pto.TileBufType.get([32, 32], u8, vec, [32, 32], cfg, ctx) @@ -50,7 +49,6 @@ def build(): sv0 = pto.PartitionViewOp(tile_view_32, tv0, offsets=[c0, c0], sizes=[c32, c32]).result sv1 = pto.PartitionViewOp(tile_view_32, tv1, offsets=[c0, c0], sizes=[c32, c32]).result - # tb0/tb1/tb2: f32; tb_tmp: uint8 (pto.tprelu requires tmp to be uint8_t). 
tb0 = pto.AllocTileOp(tile_buf_32).result tb1 = pto.AllocTileOp(tile_buf_32).result tb_tmp = pto.AllocTileOp(tile_buf_u8).result @@ -77,4 +75,4 @@ def build(): if __name__ == "__main__": - print(build()) \ No newline at end of file + print(build()) diff --git a/test/samples/Rowexpanddiv/rowexpanddiv.py b/test/samples/Rowexpanddiv/rowexpanddiv.py index 282f2cbc..f2935e1f 100644 --- a/test/samples/Rowexpanddiv/rowexpanddiv.py +++ b/test/samples/Rowexpanddiv/rowexpanddiv.py @@ -15,14 +15,18 @@ def build(): tv2_f32 = pto.TensorViewType.get(2, f32, ctx) tile_view_32 = pto.PartitionTensorViewType.get([32, 32], f32, ctx) + tile_view_rowvec = pto.PartitionTensorViewType.get([32, 1], f32, ctx) vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) + bl_col = pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx) sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) fractal_ab_size = pto.TileConfig.fractalABSize cfg = pto.TileBufConfigAttr.get(bl, sl, fractal_ab_size, pd, ctx) + cfg_rowvec = pto.TileBufConfigAttr.get(bl_col, sl, fractal_ab_size, pd, ctx) tile_buf_32 = pto.TileBufType.get([32, 32], f32, vec, [32, 32], cfg, ctx) + tile_buf_rowvec = pto.TileBufType.get([32, 1], f32, vec, [32, 1], cfg_rowvec, ctx) fn_ty = func.FunctionType.get([ptr_f32, ptr_f32, ptr_f32], []) with InsertionPoint(m.body): @@ -40,16 +44,16 @@ def build(): # %0/%1/%2 = pto.make_tensor_view %arg?, shape=[%c32,%c32] strides=[%c32,%c1] # 这里用原生 builder:通常签名会是 (result_type, ptr, shape, strides) tv0 = pto.MakeTensorViewOp(tv2_f32, arg0, [c32, c32], [c32, c1]).result - tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c32, c32], [c32, c1]).result + tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c32, c1], [c1, c1]).result tv2 = pto.MakeTensorViewOp(tv2_f32, arg2, [c32, c32], [c32, c1]).result # Replaced immediate numbers with constants c0 and c32 sv0 = pto.PartitionViewOp(tile_view_32, tv0, offsets=[c0, c0], sizes=[c32, 
c32]).result - sv1 = pto.PartitionViewOp(tile_view_32, tv1, offsets=[c0, c0], sizes=[c32, c32]).result + sv1 = pto.PartitionViewOp(tile_view_rowvec, tv1, offsets=[c0, c0], sizes=[c32, c1]).result # %5/%6/%7 = pto.alloc_tile : <32x32xf32> tb0 = pto.AllocTileOp(tile_buf_32).result - tb1 = pto.AllocTileOp(tile_buf_32).result + tb1 = pto.AllocTileOp(tile_buf_rowvec).result tb2 = pto.AllocTileOp(tile_buf_32).result pto.TLoadOp(None, sv0, tb0) # result=None @@ -71,4 +75,4 @@ def build(): if __name__ == "__main__": - print(build()) \ No newline at end of file + print(build()) diff --git a/test/samples/Rowexpanddiv/rowexpanddiv_golden.py b/test/samples/Rowexpanddiv/rowexpanddiv_golden.py index cf7134bc..51ead2a5 100755 --- a/test/samples/Rowexpanddiv/rowexpanddiv_golden.py +++ b/test/samples/Rowexpanddiv/rowexpanddiv_golden.py @@ -16,10 +16,12 @@ def main(): src0_name, src1_name = meta.inputs generator = rng() src0 = float_values(generator, meta.elem_counts[src0_name], style='signed') - src1 = float_values(generator, meta.elem_counts[src1_name], style='nonzero_signed' if 'Rowexpanddiv/rowexpanddiv_golden.py' == 'Rowexpanddiv/rowexpanddiv_golden.py' else 'signed') + src1 = float_values(generator, meta.elem_counts[src1_name], style='nonzero_signed') src0_m = matrix32(src0) - src1_m = matrix32(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + row_scalars = src1.astype(np.float32).reshape(-1) + if row_scalars.size < ROWS: + raise ValueError(f'expected at least {ROWS} row scalars, got {row_scalars.size}') + row_scalars = row_scalars[:ROWS] buffers = default_buffers(meta) buffers[src0_name] = src0 buffers[src1_name] = src1 diff --git a/test/samples/Rowexpandmul/rowexpandmul.py b/test/samples/Rowexpandmul/rowexpandmul.py index 54e93259..c1bdba31 100644 --- a/test/samples/Rowexpandmul/rowexpandmul.py +++ b/test/samples/Rowexpandmul/rowexpandmul.py @@ -15,14 +15,18 @@ def build(): tv2_f32 = pto.TensorViewType.get(2, f32, ctx) tile_view_32 = 
pto.PartitionTensorViewType.get([32, 32], f32, ctx) + tile_view_rowvec = pto.PartitionTensorViewType.get([32, 1], f32, ctx) vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) + bl_col = pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx) sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) fractal_ab_size = pto.TileConfig.fractalABSize cfg = pto.TileBufConfigAttr.get(bl, sl, fractal_ab_size, pd, ctx) + cfg_rowvec = pto.TileBufConfigAttr.get(bl_col, sl, fractal_ab_size, pd, ctx) tile_buf_32 = pto.TileBufType.get([32, 32], f32, vec, [32, 32], cfg, ctx) + tile_buf_rowvec = pto.TileBufType.get([32, 1], f32, vec, [32, 1], cfg_rowvec, ctx) fn_ty = func.FunctionType.get([ptr_f32, ptr_f32, ptr_f32], []) with InsertionPoint(m.body): @@ -39,16 +43,16 @@ def build(): # %0/%1/%2 = pto.make_tensor_view %arg?, shape=[%c32,%c32] strides=[%c32,%c1] tv0 = pto.MakeTensorViewOp(tv2_f32, arg0, [c32, c32], [c32, c1]).result - tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c32, c32], [c32, c1]).result + tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c32, c1], [c1, c1]).result tv2 = pto.MakeTensorViewOp(tv2_f32, arg2, [c32, c32], [c32, c1]).result # %3/%4/%8 = pto.subview %tv, offsets=[%c0,%c0], sizes=[32,32] sv0 = pto.PartitionViewOp(tile_view_32, tv0, offsets=[c0, c0], sizes=[c32, c32]).result - sv1 = pto.PartitionViewOp(tile_view_32, tv1, offsets=[c0, c0], sizes=[c32, c32]).result + sv1 = pto.PartitionViewOp(tile_view_rowvec, tv1, offsets=[c0, c0], sizes=[c32, c1]).result # %5/%6/%7 = pto.alloc_tile : <32x32xf32> tb0 = pto.AllocTileOp(tile_buf_32).result - tb1 = pto.AllocTileOp(tile_buf_32).result + tb1 = pto.AllocTileOp(tile_buf_rowvec).result tb2 = pto.AllocTileOp(tile_buf_32).result # pto.load_dps_tb ins(%sv) outs(%tb) diff --git a/test/samples/Rowexpandmul/rowexpandmul_golden.py b/test/samples/Rowexpandmul/rowexpandmul_golden.py index 5bbd3405..f13fc9bc 100755 --- 
a/test/samples/Rowexpandmul/rowexpandmul_golden.py +++ b/test/samples/Rowexpandmul/rowexpandmul_golden.py @@ -16,10 +16,12 @@ def main(): src0_name, src1_name = meta.inputs generator = rng() src0 = float_values(generator, meta.elem_counts[src0_name], style='signed') - src1 = float_values(generator, meta.elem_counts[src1_name], style='nonzero_signed' if 'Rowexpandmul/rowexpandmul_golden.py' == 'Rowexpanddiv/rowexpanddiv_golden.py' else 'signed') + src1 = float_values(generator, meta.elem_counts[src1_name], style='signed') src0_m = matrix32(src0) - src1_m = matrix32(src1) - row_scalars = src1_m.reshape(-1)[:ROWS].astype(np.float32) + row_scalars = src1.astype(np.float32).reshape(-1) + if row_scalars.size < ROWS: + raise ValueError(f'expected at least {ROWS} row scalars, got {row_scalars.size}') + row_scalars = row_scalars[:ROWS] buffers = default_buffers(meta) buffers[src0_name] = src0 buffers[src1_name] = src1 diff --git a/test/samples/Rowexpandsub/rowexpandsub.py b/test/samples/Rowexpandsub/rowexpandsub.py index bc99e685..5427d0c2 100644 --- a/test/samples/Rowexpandsub/rowexpandsub.py +++ b/test/samples/Rowexpandsub/rowexpandsub.py @@ -15,14 +15,18 @@ def build(): tv2_f32 = pto.TensorViewType.get(2, f32, ctx) tile_view_32 = pto.PartitionTensorViewType.get([32, 32], f32, ctx) + tile_view_rowvec = pto.PartitionTensorViewType.get([32, 1], f32, ctx) vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) + bl_col = pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx) sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) fractal_ab_size = pto.TileConfig.fractalABSize cfg = pto.TileBufConfigAttr.get(bl, sl, fractal_ab_size, pd, ctx) + cfg_rowvec = pto.TileBufConfigAttr.get(bl_col, sl, fractal_ab_size, pd, ctx) tile_buf_32 = pto.TileBufType.get([32, 32], f32, vec, [32, 32], cfg, ctx) + tile_buf_rowvec = pto.TileBufType.get([32, 1], f32, vec, [32, 1], cfg_rowvec, ctx) fn_ty = 
func.FunctionType.get([ptr_f32, ptr_f32, ptr_f32], []) with InsertionPoint(m.body): @@ -39,17 +43,17 @@ def build(): # %0/%1/%2 = pto.make_tensor_view %arg?, shape=[%c32,%c32] strides=[%c32,%c1] tv0 = pto.MakeTensorViewOp(tv2_f32, arg0, [c32, c32], [c32, c1]).result - tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c32, c32], [c32, c1]).result + tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c32, c1], [c1, c1]).result tv2 = pto.MakeTensorViewOp(tv2_f32, arg2, [c32, c32], [c32, c1]).result # %3/%4/%8 = pto.subview %tv, offsets=[%c0,%c0], sizes=[32,32] # Replaced the immediate numbers with constants (c0 and c32) sv0 = pto.PartitionViewOp(tile_view_32, tv0, offsets=[c0, c0], sizes=[c32, c32]).result - sv1 = pto.PartitionViewOp(tile_view_32, tv1, offsets=[c0, c0], sizes=[c32, c32]).result + sv1 = pto.PartitionViewOp(tile_view_rowvec, tv1, offsets=[c0, c0], sizes=[c32, c1]).result # %5/%6/%7 = pto.alloc_tile : <32x32xf32> tb0 = pto.AllocTileOp(tile_buf_32).result - tb1 = pto.AllocTileOp(tile_buf_32).result + tb1 = pto.AllocTileOp(tile_buf_rowvec).result tb2 = pto.AllocTileOp(tile_buf_32).result # pto.load_dps_tb ins(%sv) outs(%tb) @@ -73,4 +77,4 @@ def build(): if __name__ == "__main__": - print(build()) \ No newline at end of file + print(build()) diff --git a/test/samples/Rowexpandsub/rowexpandsub_golden.py b/test/samples/Rowexpandsub/rowexpandsub_golden.py index c6b3d67c..c7743ff6 100755 --- a/test/samples/Rowexpandsub/rowexpandsub_golden.py +++ b/test/samples/Rowexpandsub/rowexpandsub_golden.py @@ -16,10 +16,12 @@ def main(): src0_name, src1_name = meta.inputs generator = rng() src0 = float_values(generator, meta.elem_counts[src0_name], style='signed') - src1 = float_values(generator, meta.elem_counts[src1_name], style='nonzero_signed' if 'Rowexpandsub/rowexpandsub_golden.py' == 'Rowexpanddiv/rowexpanddiv_golden.py' else 'signed') + src1 = float_values(generator, meta.elem_counts[src1_name], style='signed') src0_m = matrix32(src0) - src1_m = matrix32(src1) - row_scalars = 
src1_m.reshape(-1)[:ROWS].astype(np.float32) + row_scalars = src1.astype(np.float32).reshape(-1) + if row_scalars.size < ROWS: + raise ValueError(f'expected at least {ROWS} row scalars, got {row_scalars.size}') + row_scalars = row_scalars[:ROWS] buffers = default_buffers(meta) buffers[src0_name] = src0 buffers[src1_name] = src1