#!/bin/bash
# ci.sh - run pytest, every example and every on-device test, then print a
# pass/fail summary and exit non-zero if anything failed.
#
# Usage:
#   ./ci.sh [-p|--platform PLATFORM] [-d|--device SPEC] [--parallel]
#
#   -p, --platform  "a2a3" schedules hardware tasks only; any other value
#                   schedules simulation (a2a3sim) tasks only. When omitted,
#                   Darwin runs simulation only and other systems run both.
#   -d, --device    A device id ("5") or an inclusive range ("4-7").
#                   Defaults to 0.
#   --parallel      Run simulation tasks concurrently and spread hardware
#                   tasks round-robin over the devices, one worker per device.
#
# The GitHub workflow (.github/workflows/ci.yml) invokes:
#   ./ci.sh -p a2a3 -d 4-7 --parallel
#
# `set -e` is not used: a failing task is recorded in the results file and
# reported in the summary instead of aborting the whole run.

PLATFORM=""
DEVICE_RANGE=""
PARALLEL=false

DEVICES=()
NUM_DEVICES=0
OS=""
OVERALL_EXIT=0
MAX_RETRIES=3

EXAMPLES_DIR="examples"
DEVICE_TESTS_DIR="tests/device_tests"

HW_TASK_NAMES=()
HW_TASK_DIRS=()
SIM_TASK_NAMES=()
SIM_TASK_DIRS=()

LOG_DIR=""
RESULTS_FILE=""
FAIL_COUNT=0
PASS_COUNT=0

# Abort when an option that takes a value is the last word on the command
# line. Without this check `shift 2` fails, leaves $# unchanged, and the
# parsing loop never terminates.
# Arguments: $1 - option name, $2 - number of words left (including $1)
require_value() {
    if [[ "$2" -lt 2 ]]; then
        echo "Option $1 requires a value" >&2
        exit 1
    fi
}

# Parse the command line into PLATFORM, DEVICE_RANGE and PARALLEL.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            # NOTE(review): this arm is inferred from the `-p a2a3` invocation
            # used by the workflow - confirm against the checked-in script.
            -p|--platform)
                require_value "$1" "$#"
                PLATFORM="$2"
                shift 2
                ;;
            -d|--device)
                require_value "$1" "$#"
                DEVICE_RANGE="$2"
                shift 2
                ;;
            --parallel)
                PARALLEL=true
                shift
                ;;
            *)
                echo "Unknown option: $1"
                exit 1
                ;;
        esac
    done
}

# Expand a device spec into the global DEVICES array.
# Arguments: $1 - "N", "N-M" (inclusive, N <= M) or empty (means device 0)
# Returns:   0 on success; 1 when a spec containing '-' is not a valid
#            ascending numeric range (DEVICES is left empty in that case).
parse_device_range() {
    local spec="${1:-0}"
    local first last dev

    DEVICES=()
    if [[ "$spec" != *-* ]]; then
        # A single id is passed through untouched, as before.
        DEVICES=("$spec")
        return 0
    fi

    if [[ ! "$spec" =~ ^([0-9]+)-([0-9]+)$ ]]; then
        echo "Invalid device range: '$spec' (expected N or N-M)" >&2
        return 1
    fi
    # 10# forces base 10 so ids such as "08" are not parsed as octal.
    first=$((10#${BASH_REMATCH[1]}))
    last=$((10#${BASH_REMATCH[2]}))
    if ((first > last)); then
        echo "Invalid device range: '$spec' (start is greater than end)" >&2
        return 1
    fi
    for ((dev = first; dev <= last; dev++)); do
        DEVICES+=("$dev")
    done
    return 0
}

# Run the pytest suite synchronously; a failure only sets OVERALL_EXIT so
# the remaining tasks still run.
run_pytest() {
    if [[ -d "tests" && "$OS" == "Linux" && "$PLATFORM" != "a2a3sim" ]]; then
        echo "Running pytest tests..."
        if ! pytest tests -v; then
            echo "PYTEST FAILED"
            OVERALL_EXIT=1
        fi
    fi
}

# Signal handler: stop every process in our process group and clean up.
cleanup() {
    # Ignore the signals first: `kill 0` also targets this shell, and the
    # handler must not be re-entered or killed before it removes the logs.
    trap '' INT TERM
    kill 0 2>/dev/null
    rm -rf "$LOG_DIR"
    exit 130
}

# Succeeds when directory $1 contains kernels/kernel_config.py and golden.py.
is_task_dir() {
    [[ -f "$1/kernels/kernel_config.py" && -f "$1/golden.py" ]]
}

# Append a task ($1 - display name, $2 - directory) to the hardware queue.
add_hw_task() {
    HW_TASK_NAMES+=("$1")
    HW_TASK_DIRS+=("$2")
}

# Append a task ($1 - display name, $2 - directory) to the simulation queue.
add_sim_task() {
    SIM_TASK_NAMES+=("$1")
    SIM_TASK_DIRS+=("$2")
}

# Queue every valid example below EXAMPLES_DIR for the platform(s) selected
# by PLATFORM, or by OS when no platform was given.
discover_examples() {
    local example_dir task

    [[ -d "$EXAMPLES_DIR" ]] || return 0
    while IFS= read -r -d '' example_dir; do
        [[ "$example_dir" == *"/scripts" ]] && continue
        is_task_dir "$example_dir" || continue

        task="example:${example_dir#"$EXAMPLES_DIR"/}"
        if [[ -n "$PLATFORM" ]]; then
            if [[ "$PLATFORM" == "a2a3" ]]; then
                add_hw_task "$task" "$example_dir"
            else
                add_sim_task "$task" "$example_dir"
            fi
        elif [[ "$OS" == "Darwin" ]]; then
            add_sim_task "$task" "$example_dir"
        else
            add_hw_task "$task" "$example_dir"
            add_sim_task "$task" "$example_dir"
        fi
    done < <(find "$EXAMPLES_DIR" -mindepth 1 -type d -print0 | sort -z)
}

# Queue the device tests below DEVICE_TESTS_DIR. They are hardware only, so
# they are scheduled for platform a2a3, or on Linux when no platform is set.
discover_device_tests() {
    local test_dir run_device_tests=false

    [[ -d "$DEVICE_TESTS_DIR" ]] || return 0
    [[ "$PLATFORM" == "a2a3" ]] && run_device_tests=true
    [[ -z "$PLATFORM" && "$OS" == "Linux" ]] && run_device_tests=true

    if [[ "$run_device_tests" != "true" ]]; then
        echo "Skipping device tests (a2a3 hardware only)"
        return 0
    fi
    while IFS= read -r -d '' test_dir; do
        is_task_dir "$test_dir" || continue
        add_hw_task "device_test:${test_dir#"$DEVICE_TESTS_DIR"/}" "$test_dir"
    done < <(find "$DEVICE_TESTS_DIR" -mindepth 1 -type d -print0 | sort -z)
}

# Run a single HW task, retrying on a different device after each failure
# (at most MAX_RETRIES attempts). Every attempt is logged to its own file in
# LOG_DIR and the final verdict is appended to RESULTS_FILE.
# Usage: run_hw_task_with_retry <name> <dir> [initial_device_id]
# Returns: 0 if an attempt passed, 1 otherwise.
# NOTE(review): a retry takes the first untried entry of DEVICES without any
# coordination, so in parallel mode it can land on a device another worker
# is currently using.
run_hw_task_with_retry() {
    local name="$1"
    local dir="$2"
    local initial_device="${3:-${DEVICES[0]}}"
    local safe_name="${name//[:\/]/_}"
    local tried_devices=()
    local failed_devices=()
    local attempt device_id dev task_log rc

    for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
        # First attempt uses the assigned device, retries pick the next
        # device that has not been tried yet.
        device_id=""
        if [[ $attempt -eq 1 ]]; then
            device_id="$initial_device"
        else
            for dev in "${DEVICES[@]}"; do
                if [[ " ${tried_devices[*]} " != *" $dev "* ]]; then
                    device_id="$dev"
                    break
                fi
            done
        fi

        # No untried device left
        if [[ -z "$device_id" ]]; then
            echo "${name}:a2a3|FAIL|failed_on:${failed_devices[*]}|no_device_left" >> "$RESULTS_FILE"
            return 1
        fi

        tried_devices+=("$device_id")
        task_log="${LOG_DIR}/${safe_name}_hw_attempt${attempt}.log"

        {
            echo "========================================"
            echo "[Device $device_id] Running: $name (attempt $attempt/$MAX_RETRIES)"
            echo "========================================"
            python examples/scripts/run_example.py \
                -k "${dir}/kernels" -g "${dir}/golden.py" \
                -p a2a3 -d "$device_id"
        } > "$task_log" 2>&1
        rc=$?

        if [[ $rc -eq 0 ]]; then
            echo "${name}:a2a3|PASS|device:${device_id}|attempt:${attempt}" >> "$RESULTS_FILE"
            return 0
        fi
        failed_devices+=("$device_id")
        echo "[Retry] $name failed on device $device_id (attempt $attempt/$MAX_RETRIES)" >&2
    done

    # All retries exhausted
    echo "${name}:a2a3|FAIL|failed_on:${failed_devices[*]}|attempts:${MAX_RETRIES}" >> "$RESULTS_FILE"
    return 1
}

# Run one simulation task and append its verdict to RESULTS_FILE.
# Arguments: $1 - task name, $2 - task directory
run_sim_task() {
    local name="$1"
    local dir="$2"

    echo "========================================"
    echo "Running: $name (a2a3sim)"
    echo "========================================"
    if python examples/scripts/run_example.py \
        -k "${dir}/kernels" -g "${dir}/golden.py" \
        -p a2a3sim; then
        echo "${name}:a2a3sim|PASS" >> "$RESULTS_FILE"
    else
        echo "${name}:a2a3sim|FAIL" >> "$RESULTS_FILE"
    fi
}

# Sequential mode: hardware tasks first (starting on the first device), then
# simulation tasks with their output streamed to the terminal.
run_sequential() {
    local i

    for i in "${!HW_TASK_NAMES[@]}"; do
        run_hw_task_with_retry "${HW_TASK_NAMES[$i]}" "${HW_TASK_DIRS[$i]}"
    done
    for i in "${!SIM_TASK_NAMES[@]}"; do
        run_sim_task "${SIM_TASK_NAMES[$i]}" "${SIM_TASK_DIRS[$i]}"
    done
}

# Parallel mode: one background job per simulation task plus one worker per
# device; each worker runs the hardware tasks of its round-robin slot in order.
run_parallel() {
    local worker_pids=()
    local slot_indices=()
    local i idx slot pid device_id safe_name

    # Simulation tasks have no device constraint.
    for i in "${!SIM_TASK_NAMES[@]}"; do
        safe_name="${SIM_TASK_NAMES[$i]//[:\/]/_}"
        run_sim_task "${SIM_TASK_NAMES[$i]}" "${SIM_TASK_DIRS[$i]}" \
            > "${LOG_DIR}/${safe_name}_sim.log" 2>&1 &
        worker_pids+=("$!")
    done

    for ((slot = 0; slot < NUM_DEVICES; slot++)); do
        device_id="${DEVICES[$slot]}"

        # Collect task indices for this device slot
        slot_indices=()
        for i in "${!HW_TASK_NAMES[@]}"; do
            if ((i % NUM_DEVICES == slot)); then
                slot_indices+=("$i")
            fi
        done
        [[ ${#slot_indices[@]} -eq 0 ]] && continue

        (
            for idx in "${slot_indices[@]}"; do
                run_hw_task_with_retry "${HW_TASK_NAMES[$idx]}" "${HW_TASK_DIRS[$idx]}" "$device_id"
            done
        ) > "${LOG_DIR}/device_${device_id}_worker.log" 2>&1 &
        worker_pids+=("$!")
    done

    # Verdicts are read from RESULTS_FILE, so worker exit codes are ignored.
    for pid in "${worker_pids[@]}"; do
        wait "$pid" 2>/dev/null || true
    done
}

# Print the results table and, for each failed task, the logs it produced.
# Sets FAIL_COUNT and PASS_COUNT.
print_summary() {
    local task_name result extra1 extra2
    local base safe_name attempt_log pass_info
    local logs=()

    FAIL_COUNT=0
    PASS_COUNT=0

    echo ""
    echo "========================================"
    echo " CI RESULTS SUMMARY"
    echo "========================================"
    printf "%-55s %s\n" "TASK" "RESULT"
    printf "%-55s %s\n" "----" "------"

    while IFS='|' read -r task_name result extra1 extra2; do
        if [[ "$result" == "FAIL" ]]; then
            printf "%-55s \033[31mFAIL\033[0m (%s)\n" "$task_name" "${extra1:+$extra1 }${extra2}"
            FAIL_COUNT=$((FAIL_COUNT + 1))

            # The results key is "<task>:<platform>" but log files are named
            # after <task> alone, so drop the platform before looking them up.
            logs=()
            case "$task_name" in
                *:a2a3sim)
                    base="${task_name%:a2a3sim}"
                    safe_name="${base//[:\/]/_}"
                    logs=("${LOG_DIR}/${safe_name}_sim.log")
                    ;;
                *:a2a3)
                    base="${task_name%:a2a3}"
                    safe_name="${base//[:\/]/_}"
                    logs=("${LOG_DIR}/${safe_name}_hw_attempt"*.log)
                    ;;
            esac
            for attempt_log in "${logs[@]}"; do
                # Skips an unmatched glob and sequential sim runs (no log file).
                if [[ -f "$attempt_log" ]]; then
                    echo "--- LOG: $(basename "$attempt_log") ---"
                    cat "$attempt_log"
                    echo "--- END ---"
                    echo ""
                fi
            done
        else
            pass_info=""
            if [[ -n "$extra1" ]]; then
                pass_info=" ($extra1, $extra2)"
            fi
            printf "%-55s \033[32mPASS\033[0m%s\n" "$task_name" "$pass_info"
            PASS_COUNT=$((PASS_COUNT + 1))
        fi
    done < "$RESULTS_FILE"

    echo "========================================"
    echo "Total: $((PASS_COUNT + FAIL_COUNT)) Passed: $PASS_COUNT Failed: $FAIL_COUNT"
    echo "========================================"
}

main() {
    local expected reported

    parse_args "$@"
    if ! parse_device_range "$DEVICE_RANGE"; then
        exit 1
    fi
    NUM_DEVICES=${#DEVICES[@]}

    OS=$(uname -s)
    echo "Running tests on $OS..."

    # Run pytest synchronously first
    run_pytest

    # Setup temp directory for logs and results
    if ! LOG_DIR=$(mktemp -d "${TMPDIR:-/tmp}/ci_parallel_$$.XXXXXX"); then
        echo "Failed to create log directory" >&2
        exit 1
    fi
    RESULTS_FILE="${LOG_DIR}/results.txt"
    touch "$RESULTS_FILE"
    trap cleanup INT TERM
    trap 'rm -rf "$LOG_DIR"' EXIT

    discover_examples
    discover_device_tests
    echo "Discovered ${#HW_TASK_NAMES[@]} hardware tasks, ${#SIM_TASK_NAMES[@]} simulation tasks"

    if [[ "$PARALLEL" == "false" ]]; then
        run_sequential
    else
        run_parallel
    fi

    print_summary

    # A worker that died before writing its verdict must not look like a pass.
    expected=$((${#HW_TASK_NAMES[@]} + ${#SIM_TASK_NAMES[@]}))
    reported=$((PASS_COUNT + FAIL_COUNT))
    if ((reported < expected)); then
        echo "ERROR: $((expected - reported)) task(s) finished without reporting a result"
        OVERALL_EXIT=1
    fi

    if [[ $FAIL_COUNT -gt 0 || $OVERALL_EXIT -ne 0 ]]; then
        exit 1
    fi
    echo "All tests passed!"
}

main "$@"