From 563a7d0c56862717388a781033cd650980609a51 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 14 Mar 2026 15:55:18 +0000
Subject: [PATCH 1/7] Initial plan


From 9f7e3591363e8c284fb7e5dc53b1d4c65a74adfa Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 14 Mar 2026 16:00:02 +0000
Subject: [PATCH 2/7] Add ace-understand support: analyze audio URL and return
 JSON metadata

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 .github/workflows/test.yml | 24 +++++++++++++++
 Dockerfile                 |  3 +-
 README.md                  | 28 ++++++++++++++++++
 action.yml                 |  6 ++++
 src/entrypoint.sh          | 60 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 88f1c1a..00e460b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -50,3 +50,27 @@ jobs:
           path: ${{ github.workspace }}/output.wav
           retention-days: 5
 
+  test-understand:
+    runs-on: ubuntu-latest
+    name: Test Audio Understanding
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Analyze audio with ace-understand
+        id: analyze
+        uses: ./
+        with:
+          understand: 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3'
+
+      - name: Check understand output
+        run: |
+          echo "understand_result: ${UNDERSTAND_RESULT}"
+          # Verify the output is non-empty valid JSON (jq -e exits non-zero when no value is produced)
+          printf '%s\n' "${UNDERSTAND_RESULT}" | jq -e .
+          # Verify expected metadata fields are present
+          printf '%s\n' "${UNDERSTAND_RESULT}" | jq -e 'has("caption") and has("lyrics") and has("bpm")'
+        env:  # passed via env, not inlined, so quotes or $ in the JSON cannot break or inject into the script
+          UNDERSTAND_RESULT: ${{ steps.analyze.outputs.understand_result }}
+
diff --git a/Dockerfile b/Dockerfile
index e97c855..0d26d97 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,6 +18,7 @@ RUN apt-get update && \
         libopenblas-dev \
         python3-pip \
         jq \
+        curl \
         ca-certificates \
     && rm -rf /var/lib/apt/lists/*
 
@@ -33,7 +34,7 @@ RUN git clone --depth 1 --recurse-submodules \
     cmake --install . --prefix /usr/local && \
     ldconfig && \
     mkdir -p /action/bin && \
-    cp ace-qwen3 dit-vae ace-undestand /action/bin/ && \
+    cp ace-qwen3 dit-vae ace-understand /action/bin/ && \
     cd / && rm -rf /tmp/acestep-cpp
 
 # ---------------------------------------------------------------------------
diff --git a/README.md b/README.md
index 5ce34c0..35d64cf 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ The pre-built Docker image ships with compiled `ace-qwen3`/`dit-vae` binaries **
 
 - 🎵 Generate high-quality music from a text caption
 - 🖊️ Optional lyrics — or let the LLM write them for you
+- 🔍 Analyze existing audio with `ace-understand` — extract caption, lyrics, BPM, key, duration, and language
 - ⚡ Native C++17 / GGML engine — lightweight, no GPU required
 - 🐳 Pre-built Docker image with models included — zero download wait
 - 🎲 Reproducible generation with optional seed
@@ -66,6 +67,23 @@ jobs:
     path: ${{ steps.audio.outputs.audio_file }}
 ```
 
+### Analyze an existing audio file
+
+Supply a URL to an MP3 or WAV file via the `understand` input.  
+The action downloads the audio, runs `ace-understand`, and returns the analysis as JSON in the `understand_result` output.  
+Audio generation is **skipped** when `understand` is set.
+
+```yaml
+- name: Analyze audio
+  id: analyze
+  uses: audiohacking/acestep-action@main
+  with:
+    understand: 'https://example.com/song.mp3'
+
+- name: Show analysis
+  run: echo '${{ steps.analyze.outputs.understand_result }}'
+```
+
 ## Inputs
 
 | Input | Description | Required | Default |
@@ -78,6 +96,7 @@ jobs:
 | `shift` | Flow-matching shift parameter | No | `3` |
 | `vocal_language` | Vocal language code (`en`, `fr`, …) | No | `en` |
 | `output_path` | Output path for the generated WAV file | No | `output.wav` |
+| `understand` | URL to an MP3 or WAV file to analyze (activates understand mode — skips generation) | No | _(empty)_ |
 
 ## Outputs
 
@@ -85,6 +104,7 @@ jobs:
 |--------|-------------|
 | `audio_file` | Path to the generated WAV audio file |
 | `generation_time` | Time taken to generate the audio in seconds |
+| `understand_result` | JSON from `ace-understand`: caption, lyrics, BPM, key, duration, language |
 
 ## How it works
 
@@ -94,17 +114,25 @@ The action runs as a **pre-built Docker container** published to GitHub Containe
 |------|---------------|
 | `ace-qwen3` binary (Qwen3 causal LM) | `/action/bin/ace-qwen3` |
 | `dit-vae` binary (DiT + Oobleck VAE) | `/action/bin/dit-vae` |
+| `ace-understand` binary (reverse pipeline) | `/action/bin/ace-understand` |
 | `Qwen3-Embedding-0.6B-Q8_0.gguf` | `/action/models/` |
 | `acestep-5Hz-lm-4B-Q8_0.gguf` | `/action/models/` |
 | `acestep-v15-turbo-Q8_0.gguf` | `/action/models/` |
 | `vae-BF16.gguf` | `/action/models/` |
 
 At runtime the entrypoint (`src/entrypoint.sh`):
+
+**Generation mode** (default — when `understand` is not set):
 1. Builds a request JSON from inputs
 2. Runs `ace-qwen3` (LLM stage: caption → enriched JSON with lyrics + audio codes)
 3. Runs `dit-vae` (DiT + VAE stage: JSON → stereo 48 kHz WAV)
 4. Moves the output WAV to the requested path in `$GITHUB_WORKSPACE`
 
+**Understand mode** (when `understand` URL is provided):
+1. Downloads the audio file from the supplied URL
+2. Runs `ace-understand` (VAE encode → FSQ tokenize → LM understand → JSON)
+3. Emits the resulting JSON as the `understand_result` action output
+
 **Image location:** `ghcr.io/audiohacking/acestep-action:latest`
 
 ## Project structure
diff --git a/action.yml b/action.yml
index 91973db..6a5bea4 100644
--- a/action.yml
+++ b/action.yml
@@ -39,12 +39,18 @@ inputs:
     description: 'Output path for the generated WAV file (relative to workspace or absolute)'
     required: false
     default: 'output.wav'
+  understand:
+    description: 'URL to an MP3 or WAV audio file to analyze with ace-understand (activates understand mode — skips audio generation)'
+    required: false
+    default: ''
 
 outputs:
   audio_file:
     description: 'Path to the generated WAV audio file'
   generation_time:
     description: 'Time taken to generate the audio in seconds'
+  understand_result:
+    description: 'JSON output from ace-understand containing caption, lyrics, bpm, key, duration, and language analysis of the provided audio'
 
 runs:
   using: 'docker'
diff --git a/src/entrypoint.sh b/src/entrypoint.sh
index 21b63a9..0fd4f96 100644
--- a/src/entrypoint.sh
+++ b/src/entrypoint.sh
@@ -23,6 +23,7 @@ INFERENCE_STEPS="${INPUT_INFERENCE_STEPS:-8}"
 SHIFT="${INPUT_SHIFT:-3}"
 VOCAL_LANGUAGE="${INPUT_VOCAL_LANGUAGE:-en}"
 OUTPUT_PATH="${INPUT_OUTPUT_PATH:-}"
+UNDERSTAND="${INPUT_UNDERSTAND:-}"
 
 # ---------------------------------------------------------------------------
 # Fixed in-image paths
@@ -31,6 +32,7 @@ OUTPUT_PATH="${INPUT_OUTPUT_PATH:-}"
 MODEL_DIR="/action/models"
 ACE_QWEN3="/action/bin/ace-qwen3"
 DIT_VAE="/action/bin/dit-vae"
+ACE_UNDERSTAND="/action/bin/ace-understand"
 
 # ---------------------------------------------------------------------------
 # Validate binaries
@@ -45,6 +47,63 @@ if [ ! -x "$DIT_VAE" ]; then
     exit 1
 fi
 
+# ---------------------------------------------------------------------------
+# Understand mode — download audio and run ace-understand, then exit
+# ---------------------------------------------------------------------------
+
+if [ -n "${UNDERSTAND}" ]; then
+    if [ ! -x "$ACE_UNDERSTAND" ]; then
+        echo "Error: ace-understand binary not found at $ACE_UNDERSTAND" >&2
+        exit 1
+    fi
+
+    WORK_DIR=$(mktemp -d)
+    trap 'rm -rf "$WORK_DIR"' EXIT
+
+    echo "=== ace-understand mode ==="
+    echo "UNDERSTAND=${UNDERSTAND}"
+
+    # Derive a sensible local filename from the URL extension
+    case "${UNDERSTAND}" in
+        *.wav|*.WAV) AUDIO_FILE="$WORK_DIR/input.wav" ;;
+        *)           AUDIO_FILE="$WORK_DIR/input.mp3" ;;
+    esac
+
+    echo ""
+    echo "=== Downloading audio ==="
+    curl -fsSL --max-time 300 -o "${AUDIO_FILE}" "${UNDERSTAND}"
+    echo "Downloaded: $(ls -lh "${AUDIO_FILE}")"
+
+    UNDERSTAND_OUTPUT="$WORK_DIR/understand_result.json"
+
+    echo ""
+    echo "=== Running ace-understand ==="
+    "$ACE_UNDERSTAND" \
+        --src-audio "${AUDIO_FILE}" \
+        --dit       "$MODEL_DIR/acestep-v15-turbo-Q8_0.gguf" \
+        --vae       "$MODEL_DIR/vae-BF16.gguf" \
+        --model     "$MODEL_DIR/acestep-5Hz-lm-4B-Q8_0.gguf" \
+        -o          "${UNDERSTAND_OUTPUT}"
+
+    echo ""
+    echo "=== ace-understand result ==="
+    cat "${UNDERSTAND_OUTPUT}"
+
+    # Set GitHub Actions output (multiline-safe heredoc with random delimiter)
+    if [ -n "${GITHUB_OUTPUT:-}" ]; then
+        DELIM="EOF_$$_${RANDOM}"
+        {
+            echo "understand_result<<${DELIM}"
+            cat "${UNDERSTAND_OUTPUT}"
+            echo "${DELIM}"
+        } >> "$GITHUB_OUTPUT"
+    fi
+
+    echo ""
+    echo "=== ace-understand complete ==="
+    exit 0
+fi
+
 # ---------------------------------------------------------------------------
 # Resolve output path: relative paths are relative to $GITHUB_WORKSPACE
 # ---------------------------------------------------------------------------
@@ -71,6 +130,7 @@ echo "INFERENCE_STEPS=${INFERENCE_STEPS}"
 echo "SHIFT=${SHIFT}"
 echo "VOCAL_LANGUAGE=${VOCAL_LANGUAGE}"
 echo "OUTPUT_PATH=${OUTPUT_PATH}"
+echo "UNDERSTAND=${UNDERSTAND}"
 echo "WORKSPACE_ROOT=${WORKSPACE_ROOT}"
 echo "GITHUB_WORKSPACE=${GITHUB_WORKSPACE:-<unset>}"
 

From 14c81c6a1814271baf2fb0c652f2495779712d06 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 14 Mar 2026 16:04:52 +0000
Subject: [PATCH 3/7] plan: understand input accepts local path or URL

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 .github/workflows/test.yml | 11 +++++++++--
 src/entrypoint.sh          |  4 ++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 00e460b..5750c9d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -53,16 +53,23 @@ jobs:
   test-understand:
     runs-on: ubuntu-latest
     name: Test Audio Understanding
+    needs: test
 
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Analyze audio with ace-understand
+      - name: Download generated audio from test job
+        uses: actions/download-artifact@v4
+        with:
+          name: generated-audio
+          path: ${{ github.workspace }}
+
+      - name: Analyze audio with ace-understand (dogfooding generated output)
         id: analyze
         uses: ./
         with:
-          understand: 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3'
+          understand: 'file:///github/workspace/output.wav'  # /github/workspace is $GITHUB_WORKSPACE bind-mounted inside the Docker action container
 
       - name: Check understand output
         run: |
diff --git a/src/entrypoint.sh b/src/entrypoint.sh
index 0fd4f96..bc1076e 100644
--- a/src/entrypoint.sh
+++ b/src/entrypoint.sh
@@ -72,6 +72,10 @@ if [ -n "${UNDERSTAND}" ]; then
     echo ""
     echo "=== Downloading audio ==="
     curl -fsSL --max-time 300 -o "${AUDIO_FILE}" "${UNDERSTAND}"
+    if [ ! -s "${AUDIO_FILE}" ]; then
+        echo "Error: downloaded audio file is missing or empty at ${AUDIO_FILE}" >&2
+        exit 1
+    fi
     echo "Downloaded: $(ls -lh "${AUDIO_FILE}")"
 
     UNDERSTAND_OUTPUT="$WORK_DIR/understand_result.json"

From 3f19998e8be9a8fc75d5c66045dbba3c8115c505 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 14 Mar 2026 16:06:32 +0000
Subject: [PATCH 4/7] understand input: accept local path or URL, CI dogfoods
 generated audio directly

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 .github/workflows/test.yml |  2 +-
 README.md                  | 21 +++++++++++++------
 action.yml                 |  2 +-
 src/entrypoint.sh          | 41 +++++++++++++++++++++++++++-----------
 4 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5750c9d..05acb28 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -69,7 +69,7 @@ jobs:
         id: analyze
         uses: ./
         with:
-          understand: 'file:///github/workspace/output.wav'  # /github/workspace is $GITHUB_WORKSPACE bind-mounted inside the Docker action container
+          understand: '/github/workspace/output.wav'
 
       - name: Check understand output
         run: |
diff --git a/README.md b/README.md
index 35d64cf..6f56a87 100644
--- a/README.md
+++ b/README.md
@@ -69,17 +69,26 @@ jobs:
 
 ### Analyze an existing audio file
 
-Supply a URL to an MP3 or WAV file via the `understand` input.  
-The action downloads the audio, runs `ace-understand`, and returns the analysis as JSON in the `understand_result` output.  
+Supply a local file path or a URL (http/https) to an MP3 or WAV file via the `understand` input.  
+The action uses the file directly if a path is given, or downloads it first if a URL is provided.  
+It then runs `ace-understand` and returns the analysis as JSON in the `understand_result` output.  
 Audio generation is **skipped** when `understand` is set.
 
 ```yaml
-- name: Analyze audio
+# From a URL
+- name: Analyze audio (URL)
   id: analyze
   uses: audiohacking/acestep-action@main
   with:
     understand: 'https://example.com/song.mp3'
 
+# From a local path (alternative to the URL step above — use one or the other, since step ids must be unique)
+- name: Analyze audio (local file)
+  id: analyze
+  uses: audiohacking/acestep-action@main
+  with:
+    understand: '/github/workspace/output.wav'
+
 - name: Show analysis
   run: echo '${{ steps.analyze.outputs.understand_result }}'
 ```
@@ -96,7 +105,7 @@ Audio generation is **skipped** when `understand` is set.
 | `shift` | Flow-matching shift parameter | No | `3` |
 | `vocal_language` | Vocal language code (`en`, `fr`, …) | No | `en` |
 | `output_path` | Output path for the generated WAV file | No | `output.wav` |
-| `understand` | URL to an MP3 or WAV file to analyze (activates understand mode — skips generation) | No | _(empty)_ |
+| `understand` | Local file path or URL (http/https/ftp/file) to an MP3 or WAV file to analyze (activates understand mode — skips generation) | No | _(empty)_ |
 
 ## Outputs
 
@@ -128,8 +137,8 @@ At runtime the entrypoint (`src/entrypoint.sh`):
 3. Runs `dit-vae` (DiT + VAE stage: JSON → stereo 48 kHz WAV)
 4. Moves the output WAV to the requested path in `$GITHUB_WORKSPACE`
 
-**Understand mode** (when `understand` URL is provided):
-1. Downloads the audio file from the supplied URL
+**Understand mode** (when `understand` is provided):
+1. If a URL (http/https/ftp/file) is given, downloads the audio file; if a local path is given, uses it directly
 2. Runs `ace-understand` (VAE encode → FSQ tokenize → LM understand → JSON)
 3. Emits the resulting JSON as the `understand_result` action output
 
diff --git a/action.yml b/action.yml
index 6a5bea4..ba344a5 100644
--- a/action.yml
+++ b/action.yml
@@ -40,7 +40,7 @@ inputs:
     required: false
     default: 'output.wav'
   understand:
-    description: 'URL to an MP3 or WAV audio file to analyze with ace-understand (activates understand mode — skips audio generation)'
+    description: 'Local file path or URL (http/https) to an MP3 or WAV audio file to analyze with ace-understand (activates understand mode — skips audio generation)'
     required: false
     default: ''
 
diff --git a/src/entrypoint.sh b/src/entrypoint.sh
index bc1076e..d608136 100644
--- a/src/entrypoint.sh
+++ b/src/entrypoint.sh
@@ -63,21 +63,38 @@ if [ -n "${UNDERSTAND}" ]; then
     echo "=== ace-understand mode ==="
     echo "UNDERSTAND=${UNDERSTAND}"
 
-    # Derive a sensible local filename from the URL extension
+    # Determine whether the input is a URL (requires curl) or a local path
     case "${UNDERSTAND}" in
-        *.wav|*.WAV) AUDIO_FILE="$WORK_DIR/input.wav" ;;
-        *)           AUDIO_FILE="$WORK_DIR/input.mp3" ;;
+        http://*|https://*|ftp://*|file://*)
+            # URL — derive a local filename from the extension and download
+            case "${UNDERSTAND}" in
+                *.wav|*.WAV) AUDIO_FILE="$WORK_DIR/input.wav" ;;
+                *)           AUDIO_FILE="$WORK_DIR/input.mp3" ;;
+            esac
+            echo ""
+            echo "=== Downloading audio ==="
+            curl -fsSL --max-time 300 -o "${AUDIO_FILE}" "${UNDERSTAND}"
+            if [ ! -s "${AUDIO_FILE}" ]; then
+                echo "Error: downloaded audio file is missing or empty at ${AUDIO_FILE}" >&2
+                exit 1
+            fi
+            echo "Downloaded: $(ls -lh "${AUDIO_FILE}")"
+            ;;
+        *)
+            # Local path — use directly (no download needed)
+            AUDIO_FILE="${UNDERSTAND}"
+            if [ ! -f "${AUDIO_FILE}" ]; then
+                echo "Error: local audio file not found at ${AUDIO_FILE}" >&2
+                exit 1
+            fi
+            if [ ! -s "${AUDIO_FILE}" ]; then
+                echo "Error: local audio file is empty at ${AUDIO_FILE}" >&2
+                exit 1
+            fi
+            echo "Using local file: $(ls -lh "${AUDIO_FILE}")"
+            ;;
     esac
 
-    echo ""
-    echo "=== Downloading audio ==="
-    curl -fsSL --max-time 300 -o "${AUDIO_FILE}" "${UNDERSTAND}"
-    if [ ! -s "${AUDIO_FILE}" ]; then
-        echo "Error: downloaded audio file is missing or empty at ${AUDIO_FILE}" >&2
-        exit 1
-    fi
-    echo "Downloaded: $(ls -lh "${AUDIO_FILE}")"
-
     UNDERSTAND_OUTPUT="$WORK_DIR/understand_result.json"
 
     echo ""

From f705ca87cff771d07b93585a1a7a3c52f8f65180 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Sat, 14 Mar 2026 17:28:30 +0100
Subject: [PATCH 5/7] Change audio output format from WAV to MP3

---
 .github/workflows/test.yml | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 05acb28..352acfa 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -27,18 +27,13 @@ jobs:
           lyrics: ''
           duration: '5'
           inference_steps: '4'
-          output_path: 'output.wav'
+          output_path: 'request00.mp3'
 
       - name: Check output
         run: |
           echo "Generated audio file: ${{ steps.generate.outputs.audio_file }}"
           echo "Generation time: ${{ steps.generate.outputs.generation_time }} seconds"
-          ls -lh "${{ github.workspace }}/output.wav"
-
-      - name: Debug - verify workspace file presence and format
-        run: |
-          ls -lh "${{ github.workspace }}/output.wav"
-          cat "${{ github.workspace }}/output.wav" | head -c 44 | base64
+          ls -lh "${{ github.workspace }}/request00.mp3"
 
       - name: List workspace before upload
         run: ls -lh ${{ github.workspace }}
@@ -47,7 +42,7 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: generated-audio
-          path: ${{ github.workspace }}/output.wav
+          path: ${{ github.workspace }}/request00.mp3
           retention-days: 5
 
   test-understand:
@@ -69,7 +64,7 @@ jobs:
         id: analyze
         uses: ./
         with:
-          understand: '/github/workspace/output.wav'
+          understand: '/github/workspace/request00.mp3'
 
       - name: Check understand output
         run: |

From ec2fec502978ff6f046ff50d11b1034000104723 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Sat, 14 Mar 2026 17:31:24 +0100
Subject: [PATCH 6/7] Change audio file format from WAV to MP3

---
 src/entrypoint.sh | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/entrypoint.sh b/src/entrypoint.sh
index d608136..fc3e5bc 100644
--- a/src/entrypoint.sh
+++ b/src/entrypoint.sh
@@ -68,7 +68,7 @@ if [ -n "${UNDERSTAND}" ]; then
         http://*|https://*|ftp://*|file://*)
             # URL — derive a local filename from the extension and download
             case "${UNDERSTAND}" in
-                *.wav|*.WAV) AUDIO_FILE="$WORK_DIR/input.wav" ;;
+                *.[wW][aA][vV]) AUDIO_FILE="$WORK_DIR/input.wav" ;;  # keep the .wav arm so WAV downloads are not saved as input.mp3
                 *)           AUDIO_FILE="$WORK_DIR/input.mp3" ;;
             esac
             echo ""
@@ -133,7 +133,7 @@ WORKSPACE_ROOT="${GITHUB_WORKSPACE:-/github/workspace}"
 
 # Default to workspace root if not specified
 if [ -z "$OUTPUT_PATH" ]; then
-    OUTPUT_PATH="${WORKSPACE_ROOT}/output.wav"
+    OUTPUT_PATH="${WORKSPACE_ROOT}/output.mp3"
 elif [[ "$OUTPUT_PATH" != /* ]]; then
     OUTPUT_PATH="${WORKSPACE_ROOT}/${OUTPUT_PATH}"
 fi
@@ -215,7 +215,7 @@ echo "=== Stage 1: ace-qwen3 (LLM) ==="
 REQUEST0_FILE="${REQUEST_FILE%.json}0.json"
 
 # ---------------------------------------------------------------------------
-# Stage 2 — DiT + VAE: synthesises stereo 48 kHz WAV → request00.wav
+# Stage 2 — DiT + VAE: synthesises stereo 48 kHz audio → request00.mp3
 # ---------------------------------------------------------------------------
 
 echo ""
@@ -226,8 +226,8 @@ echo "=== Stage 2: dit-vae (DiT + VAE) ==="
     --dit          "$MODEL_DIR/acestep-v15-turbo-Q8_0.gguf" \
     --vae          "$MODEL_DIR/vae-BF16.gguf"
 
-# dit-vae writes requestN0.wav alongside the request0.json file
-OUTPUT_WAV="${REQUEST0_FILE%.json}0.wav"
+# dit-vae writes requestN0.mp3 alongside the request0.json file
+OUTPUT_WAV="${REQUEST0_FILE%.json}0.mp3"
 
 echo "=== Directory listings (pre-move) ==="
 echo "WORK_DIR=${WORK_DIR}"
@@ -272,7 +272,7 @@ fi
 # ---------------------------------------------------------------------------
 
 ACTIONS_WORKSPACE="/github/workspace"
-ACTIONS_OUTPUT="${ACTIONS_WORKSPACE}/output.wav"
+ACTIONS_OUTPUT="${ACTIONS_WORKSPACE}/output.mp3"
 
 echo "=== Directory listings (pre-copy) ==="
 ls -lh "$ACTIONS_WORKSPACE" || echo "(ls ${ACTIONS_WORKSPACE} failed)"
@@ -309,7 +309,7 @@ fi
 # ---------------------------------------------------------------------------
 
 if [ -n "${GITHUB_OUTPUT:-}" ]; then
-    # Always point to /github/workspace/output.wav so subsequent steps can
+    # Always point to /github/workspace/output.mp3 so subsequent steps can
     # reliably access the file regardless of what output_path was specified.
     echo "audio_file=${ACTIONS_OUTPUT}" >> "$GITHUB_OUTPUT"
     echo "generation_time=${GENERATION_TIME}" >> "$GITHUB_OUTPUT"

From cad1934df7e89facb0f630222e67404d15001226 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Sat, 14 Mar 2026 17:34:24 +0100
Subject: [PATCH 7/7] Rename audio file references from request00.mp3 to
 output.mp3

---
 .github/workflows/test.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 352acfa..3a8781d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -27,13 +27,13 @@ jobs:
           lyrics: ''
           duration: '5'
           inference_steps: '4'
-          output_path: 'request00.mp3'
+          output_path: 'output.mp3'
 
       - name: Check output
         run: |
           echo "Generated audio file: ${{ steps.generate.outputs.audio_file }}"
           echo "Generation time: ${{ steps.generate.outputs.generation_time }} seconds"
-          ls -lh "${{ github.workspace }}/request00.mp3"
+          ls -lh "${{ github.workspace }}/output.mp3"
 
       - name: List workspace before upload
         run: ls -lh ${{ github.workspace }}
@@ -42,7 +42,7 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: generated-audio
-          path: ${{ github.workspace }}/request00.mp3
+          path: ${{ github.workspace }}/output.mp3
           retention-days: 5
 
   test-understand:
@@ -64,7 +64,7 @@ jobs:
         id: analyze
         uses: ./
         with:
-          understand: '/github/workspace/request00.mp3'
+          understand: '/github/workspace/output.mp3'
 
       - name: Check understand output
         run: |