audiohacking · lmangani · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -27,18 +27,13 @@ jobs:
           lyrics: ''
           duration: '5'
           inference_steps: '4'
-          output_path: 'output.wav'
+          output_path: 'output.mp3'
 
       - name: Check output
         run: |
           echo "Generated audio file: ${{ steps.generate.outputs.audio_file }}"
           echo "Generation time: ${{ steps.generate.outputs.generation_time }} seconds"
-          ls -lh "${{ github.workspace }}/output.wav"
-
-      - name: Debug - verify workspace file presence and format
-        run: |
-          ls -lh "${{ github.workspace }}/output.wav"
-          cat "${{ github.workspace }}/output.wav" | head -c 44 | base64
+          ls -lh "${{ github.workspace }}/output.mp3"
 
       - name: List workspace before upload
         run: ls -lh ${{ github.workspace }}
@@ -47,6 +42,37 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: generated-audio
-          path: ${{ github.workspace }}/output.wav
+          path: ${{ github.workspace }}/output.mp3
           retention-days: 5
 
+  test-understand:
+    runs-on: ubuntu-latest
+    name: Test Audio Understanding
+    needs: test
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download generated audio from test job
+        uses: actions/download-artifact@v4
+        with:
+          name: generated-audio
+          path: ${{ github.workspace }}
+
+      - name: Analyze audio with ace-understand (dogfooding generated output)
+        id: analyze
+        uses: ./
+        with:
+          understand: '/github/workspace/output.mp3'
+
+      - name: Check understand output
+        env:
+          UNDERSTAND_RESULT: ${{ steps.analyze.outputs.understand_result }}
+        run: |
+          echo "understand_result: ${UNDERSTAND_RESULT}"
+          # The result is read from env, not inlined in quotes, so apostrophes in lyrics cannot break the script
+          # jq -e fails on empty or invalid JSON and when an expected metadata field is missing
+          jq -e . <<< "${UNDERSTAND_RESULT}"
+          jq -e 'has("caption") and has("lyrics") and has("bpm")' <<< "${UNDERSTAND_RESULT}"
+
diff --git a/Dockerfile b/Dockerfile
@@ -18,6 +18,7 @@ RUN apt-get update && \
         libopenblas-dev \
         python3-pip \
         jq \
+        curl \
         ca-certificates \
     && rm -rf /var/lib/apt/lists/*
 
@@ -33,7 +34,7 @@ RUN git clone --depth 1 --recurse-submodules \
     cmake --install . --prefix /usr/local && \
     ldconfig && \
     mkdir -p /action/bin && \
-    cp ace-qwen3 dit-vae ace-undestand /action/bin/ && \
+    cp ace-qwen3 dit-vae ace-understand /action/bin/ && \
     cd / && rm -rf /tmp/acestep-cpp
 
 # ---------------------------------------------------------------------------

diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ The pre-built Docker image ships with compiled `ace-qwen3`/`dit-vae` binaries **
 
 - 🎵 Generate high-quality music from a text caption
 - 🖊️ Optional lyrics — or let the LLM write them for you
+- 🔍 Analyze existing audio with `ace-understand` — extract caption, lyrics, BPM, key, duration, and language
 - ⚡ Native C++17 / GGML engine — lightweight, no GPU required
 - 🐳 Pre-built Docker image with models included — zero download wait
 - 🎲 Reproducible generation with optional seed
@@ -66,6 +67,32 @@ jobs:
     path: ${{ steps.audio.outputs.audio_file }}
 ```
 
+### Analyze an existing audio file
+
+Supply a local file path or a URL (http/https) to an MP3 or WAV file via the `understand` input.  
+The action uses the file directly if a path is given, or downloads it first if a URL is provided.  
+It then runs `ace-understand` and returns the analysis as JSON in the `understand_result` output.  
+Audio generation is **skipped** when `understand` is set.
+
+```yaml
+# From a URL
+- name: Analyze audio (URL)
+  id: analyze
+  uses: audiohacking/acestep-action@main
+  with:
+    understand: 'https://example.com/song.mp3'
+
+# Alternatively, from a local path (e.g. a file already in the workspace) — use this step instead of the URL step, since step ids must be unique within a job
+- name: Analyze audio (local file)
+  id: analyze
+  uses: audiohacking/acestep-action@main
+  with:
+    understand: '/github/workspace/output.wav'
+
+- name: Show analysis
+  run: echo '${{ steps.analyze.outputs.understand_result }}'
+```
+
 ## Inputs
 
 | Input | Description | Required | Default |
@@ -78,13 +105,15 @@ jobs:
 | `shift` | Flow-matching shift parameter | No | `3` |
 | `vocal_language` | Vocal language code (`en`, `fr`, …) | No | `en` |
 | `output_path` | Output path for the generated WAV file | No | `output.wav` |
+| `understand` | Local file path or URL (http/https) to an MP3 or WAV file to analyze (activates understand mode — skips generation) | No | _(empty)_ |
 
 ## Outputs
 
 | Output | Description |
 |--------|-------------|
 | `audio_file` | Path to the generated WAV audio file |
 | `generation_time` | Time taken to generate the audio in seconds |
+| `understand_result` | JSON from `ace-understand`: caption, lyrics, BPM, key, duration, language |
 
 ## How it works
 
@@ -94,17 +123,25 @@ The action runs as a **pre-built Docker container** published to GitHub Containe
 |------|---------------|
 | `ace-qwen3` binary (Qwen3 causal LM) | `/action/bin/ace-qwen3` |
 | `dit-vae` binary (DiT + Oobleck VAE) | `/action/bin/dit-vae` |
+| `ace-understand` binary (reverse pipeline) | `/action/bin/ace-understand` |
 | `Qwen3-Embedding-0.6B-Q8_0.gguf` | `/action/models/` |
 | `acestep-5Hz-lm-4B-Q8_0.gguf` | `/action/models/` |
 | `acestep-v15-turbo-Q8_0.gguf` | `/action/models/` |
 | `vae-BF16.gguf` | `/action/models/` |
 
 At runtime the entrypoint (`src/entrypoint.sh`):
+
+**Generation mode** (default — when `understand` is not set):
 1. Builds a request JSON from inputs
 2. Runs `ace-qwen3` (LLM stage: caption → enriched JSON with lyrics + audio codes)
 3. Runs `dit-vae` (DiT + VAE stage: JSON → stereo 48 kHz WAV)
 4. Moves the output WAV to the requested path in `$GITHUB_WORKSPACE`
 
+**Understand mode** (when `understand` is provided):
+1. If a URL (http/https/ftp/file) is given, downloads the audio file; if a local path is given, uses it directly
+2. Runs `ace-understand` (VAE encode → FSQ tokenize → LM understand → JSON)
+3. Emits the resulting JSON as the `understand_result` action output
+
 **Image location:** `ghcr.io/audiohacking/acestep-action:latest`
 
 ## Project structure

diff --git a/action.yml b/action.yml
@@ -39,12 +39,18 @@ inputs:
     description: 'Output path for the generated WAV file (relative to workspace or absolute)'
     required: false
     default: 'output.wav'
+  understand:
+    description: 'Local file path or URL (http/https) to an MP3 or WAV audio file to analyze with ace-understand (activates understand mode — skips audio generation)'
+    required: false
+    default: ''
 
 outputs:
   audio_file:
     description: 'Path to the generated WAV audio file'
   generation_time:
     description: 'Time taken to generate the audio in seconds'
+  understand_result:
+    description: 'JSON output from ace-understand containing caption, lyrics, bpm, key, duration, and language analysis of the provided audio'
 
 runs:
   using: 'docker'

diff --git a/src/entrypoint.sh b/src/entrypoint.sh
@@ -23,6 +23,7 @@ INFERENCE_STEPS="${INPUT_INFERENCE_STEPS:-8}"
 SHIFT="${INPUT_SHIFT:-3}"
 VOCAL_LANGUAGE="${INPUT_VOCAL_LANGUAGE:-en}"
 OUTPUT_PATH="${INPUT_OUTPUT_PATH:-}"
+UNDERSTAND="${INPUT_UNDERSTAND:-}"
 
 # ---------------------------------------------------------------------------
 # Fixed in-image paths
@@ -31,6 +32,7 @@ OUTPUT_PATH="${INPUT_OUTPUT_PATH:-}"
 MODEL_DIR="/action/models"
 ACE_QWEN3="/action/bin/ace-qwen3"
 DIT_VAE="/action/bin/dit-vae"
+ACE_UNDERSTAND="/action/bin/ace-understand"
 
 # ---------------------------------------------------------------------------
 # Validate binaries
@@ -45,6 +47,84 @@ if [ ! -x "$DIT_VAE" ]; then
     exit 1
 fi
 
+# ---------------------------------------------------------------------------
+# Understand mode — fetch or locate the audio, run ace-understand, then exit
+# ---------------------------------------------------------------------------
+
+if [ -n "${UNDERSTAND}" ]; then
+    if [ ! -x "$ACE_UNDERSTAND" ]; then
+        echo "Error: ace-understand binary not found at $ACE_UNDERSTAND" >&2
+        exit 1
+    fi
+
+    WORK_DIR=$(mktemp -d)
+    trap 'rm -rf "$WORK_DIR"' EXIT
+
+    echo "=== ace-understand mode ==="
+    echo "UNDERSTAND=${UNDERSTAND}"
+
+    # Determine whether the input is a URL (requires curl) or a local path
+    case "${UNDERSTAND}" in
+        http://*|https://*|ftp://*|file://*)
+            # URL — keep a .wav extension for the local copy, otherwise default to .mp3, then download
+            case "${UNDERSTAND}" in
+                *.wav|*.WAV) AUDIO_FILE="$WORK_DIR/input.wav" ;;
+                *)           AUDIO_FILE="$WORK_DIR/input.mp3" ;;
+            esac
+            echo ""
+            echo "=== Downloading audio ==="
+            curl -fsSL --max-time 300 -o "${AUDIO_FILE}" "${UNDERSTAND}"
+            if [ ! -s "${AUDIO_FILE}" ]; then
+                echo "Error: downloaded audio file is missing or empty at ${AUDIO_FILE}" >&2
+                exit 1
+            fi
+            echo "Downloaded: $(ls -lh "${AUDIO_FILE}")"
+            ;;
+        *)
+            # Local path — use directly (no download needed)
+            AUDIO_FILE="${UNDERSTAND}"
+            if [ ! -f "${AUDIO_FILE}" ]; then
+                echo "Error: local audio file not found at ${AUDIO_FILE}" >&2
+                exit 1
+            fi
+            if [ ! -s "${AUDIO_FILE}" ]; then
+                echo "Error: local audio file is empty at ${AUDIO_FILE}" >&2
+                exit 1
+            fi
+            echo "Using local file: $(ls -lh "${AUDIO_FILE}")"
+            ;;
+    esac
+
+    UNDERSTAND_OUTPUT="$WORK_DIR/understand_result.json"
+
+    echo ""
+    echo "=== Running ace-understand ==="
+    "$ACE_UNDERSTAND" \
+        --src-audio "${AUDIO_FILE}" \
+        --dit       "$MODEL_DIR/acestep-v15-turbo-Q8_0.gguf" \
+        --vae       "$MODEL_DIR/vae-BF16.gguf" \
+        --model     "$MODEL_DIR/acestep-5Hz-lm-4B-Q8_0.gguf" \
+        -o          "${UNDERSTAND_OUTPUT}"
+
+    echo ""
+    echo "=== ace-understand result ==="
+    cat "${UNDERSTAND_OUTPUT}"
+
+    # Set GitHub Actions output (multiline-safe heredoc with random delimiter)
+    if [ -n "${GITHUB_OUTPUT:-}" ]; then
+        DELIM="EOF_$$_${RANDOM}"
+        {
+            echo "understand_result<<${DELIM}"
+            cat "${UNDERSTAND_OUTPUT}"
+            echo "${DELIM}"
+        } >> "$GITHUB_OUTPUT"
+    fi
+
+    echo ""
+    echo "=== ace-understand complete ==="
+    exit 0
+fi
+
 # ---------------------------------------------------------------------------
 # Resolve output path: relative paths are relative to $GITHUB_WORKSPACE
 # ---------------------------------------------------------------------------
@@ -53,7 +133,7 @@ WORKSPACE_ROOT="${GITHUB_WORKSPACE:-/github/workspace}"
 
 # Default to workspace root if not specified
 if [ -z "$OUTPUT_PATH" ]; then
-    OUTPUT_PATH="${WORKSPACE_ROOT}/output.wav"
+    OUTPUT_PATH="${WORKSPACE_ROOT}/output.mp3"
 elif [[ "$OUTPUT_PATH" != /* ]]; then
     OUTPUT_PATH="${WORKSPACE_ROOT}/${OUTPUT_PATH}"
 fi
@@ -71,6 +151,7 @@ echo "INFERENCE_STEPS=${INFERENCE_STEPS}"
 echo "SHIFT=${SHIFT}"
 echo "VOCAL_LANGUAGE=${VOCAL_LANGUAGE}"
 echo "OUTPUT_PATH=${OUTPUT_PATH}"
+echo "UNDERSTAND=${UNDERSTAND}"
 echo "WORKSPACE_ROOT=${WORKSPACE_ROOT}"
 echo "GITHUB_WORKSPACE=${GITHUB_WORKSPACE:-<unset>}"
 
@@ -134,7 +215,7 @@ echo "=== Stage 1: ace-qwen3 (LLM) ==="
 REQUEST0_FILE="${REQUEST_FILE%.json}0.json"
 
 # ---------------------------------------------------------------------------
-# Stage 2 — DiT + VAE: synthesises stereo 48 kHz WAV → request00.wav
+# Stage 2 — DiT + VAE: synthesises stereo 48 kHz audio → request00.mp3
 # ---------------------------------------------------------------------------
 
 echo ""
@@ -145,8 +226,8 @@ echo "=== Stage 2: dit-vae (DiT + VAE) ==="
     --dit          "$MODEL_DIR/acestep-v15-turbo-Q8_0.gguf" \
     --vae          "$MODEL_DIR/vae-BF16.gguf"
 
-# dit-vae writes requestN0.wav alongside the request0.json file
-OUTPUT_WAV="${REQUEST0_FILE%.json}0.wav"
+# dit-vae writes requestN0.mp3 alongside the request0.json file
+OUTPUT_WAV="${REQUEST0_FILE%.json}0.mp3"
 
 echo "=== Directory listings (pre-move) ==="
 echo "WORK_DIR=${WORK_DIR}"
@@ -191,7 +272,7 @@ fi
 # ---------------------------------------------------------------------------
 
 ACTIONS_WORKSPACE="/github/workspace"
-ACTIONS_OUTPUT="${ACTIONS_WORKSPACE}/output.wav"
+ACTIONS_OUTPUT="${ACTIONS_WORKSPACE}/output.mp3"
 
 echo "=== Directory listings (pre-copy) ==="
 ls -lh "$ACTIONS_WORKSPACE" || echo "(ls ${ACTIONS_WORKSPACE} failed)"
@@ -228,7 +309,7 @@ fi
 # ---------------------------------------------------------------------------
 
 if [ -n "${GITHUB_OUTPUT:-}" ]; then
-    # Always point to /github/workspace/output.wav so subsequent steps can
+    # Always point to /github/workspace/output.mp3 so subsequent steps can
     # reliably access the file regardless of what output_path was specified.
     echo "audio_file=${ACTIONS_OUTPUT}" >> "$GITHUB_OUTPUT"
     echo "generation_time=${GENERATION_TIME}" >> "$GITHUB_OUTPUT"