diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 88f1c1a..3a8781d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,18 +27,13 @@ jobs: lyrics: '' duration: '5' inference_steps: '4' - output_path: 'output.wav' + output_path: 'output.mp3' - name: Check output run: | echo "Generated audio file: ${{ steps.generate.outputs.audio_file }}" echo "Generation time: ${{ steps.generate.outputs.generation_time }} seconds" - ls -lh "${{ github.workspace }}/output.wav" - - - name: Debug - verify workspace file presence and format - run: | - ls -lh "${{ github.workspace }}/output.wav" - cat "${{ github.workspace }}/output.wav" | head -c 44 | base64 + ls -lh "${{ github.workspace }}/output.mp3" - name: List workspace before upload run: ls -lh ${{ github.workspace }} @@ -47,6 +42,37 @@ jobs: uses: actions/upload-artifact@v4 with: name: generated-audio - path: ${{ github.workspace }}/output.wav + path: ${{ github.workspace }}/output.mp3 retention-days: 5 + test-understand: + runs-on: ubuntu-latest + name: Test Audio Understanding + needs: test + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Download generated audio from test job + uses: actions/download-artifact@v4 + with: + name: generated-audio + path: ${{ github.workspace }} + + - name: Analyze audio with ace-understand (dogfooding generated output) + id: analyze + uses: ./ + with: + understand: '/github/workspace/output.mp3' + + - name: Check understand output + env: + UNDERSTAND_RESULT: ${{ steps.analyze.outputs.understand_result }} + run: | + echo "understand_result: ${UNDERSTAND_RESULT}" + # Verify the output is non-empty valid JSON (the value is passed via env rather than inline expression interpolation, so quotes in the JSON cannot break or inject into the script) + printf '%s\n' "${UNDERSTAND_RESULT}" | jq . 
+ # Verify expected metadata fields are present + printf '%s\n' "${UNDERSTAND_RESULT}" | jq -e 'has("caption") and has("lyrics") and has("bpm")' + diff --git a/Dockerfile b/Dockerfile index e97c855..0d26d97 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ RUN apt-get update && \ libopenblas-dev \ python3-pip \ jq \ + curl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* @@ -33,7 +34,7 @@ RUN git clone --depth 1 --recurse-submodules \ cmake --install . --prefix /usr/local && \ ldconfig && \ mkdir -p /action/bin && \ - cp ace-qwen3 dit-vae ace-undestand /action/bin/ && \ + cp ace-qwen3 dit-vae ace-understand /action/bin/ && \ cd / && rm -rf /tmp/acestep-cpp # --------------------------------------------------------------------------- diff --git a/README.md b/README.md index 5ce34c0..6f56a87 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ The pre-built Docker image ships with compiled `ace-qwen3`/`dit-vae` binaries ** - 🎵 Generate high-quality music from a text caption - 🖊️ Optional lyrics — or let the LLM write them for you +- 🔍 Analyze existing audio with `ace-understand` — extract caption, lyrics, BPM, key, duration, and language - ⚡ Native C++17 / GGML engine — lightweight, no GPU required - 🐳 Pre-built Docker image with models included — zero download wait - 🎲 Reproducible generation with optional seed @@ -66,6 +67,32 @@ jobs: path: ${{ steps.audio.outputs.audio_file }} ``` +### Analyze an existing audio file + +Supply a local file path or a URL (http/https) to an MP3 or WAV file via the `understand` input. +The action uses the file directly if a path is given, or downloads it first if a URL is provided. +It then runs `ace-understand` and returns the analysis as JSON in the `understand_result` output. +Audio generation is **skipped** when `understand` is set. 
+ +```yaml +# Option A: from a URL +- name: Analyze audio (URL) + id: analyze + uses: audiohacking/acestep-action@main + with: + understand: 'https://example.com/song.mp3' + +# Option B: from a local path (e.g. a file already in the workspace); use instead of option A, since step ids must be unique within a job +- name: Analyze audio (local file) + id: analyze + uses: audiohacking/acestep-action@main + with: + understand: '/github/workspace/output.wav' + +- name: Show analysis + run: echo '${{ steps.analyze.outputs.understand_result }}' +``` + ## Inputs | Input | Description | Required | Default | @@ -78,6 +105,7 @@ jobs: | `shift` | Flow-matching shift parameter | No | `3` | | `vocal_language` | Vocal language code (`en`, `fr`, …) | No | `en` | | `output_path` | Output path for the generated WAV file | No | `output.wav` | +| `understand` | Local file path or URL (http/https) to an MP3 or WAV file to analyze (activates understand mode — skips generation) | No | _(empty)_ | ## Outputs @@ -85,6 +113,7 @@ jobs: |--------|-------------| | `audio_file` | Path to the generated WAV audio file | | `generation_time` | Time taken to generate the audio in seconds | +| `understand_result` | JSON from `ace-understand`: caption, lyrics, BPM, key, duration, language | ## How it works @@ -94,17 +123,25 @@ The action runs as a **pre-built Docker container** published to GitHub Containe |------|---------------| | `ace-qwen3` binary (Qwen3 causal LM) | `/action/bin/ace-qwen3` | | `dit-vae` binary (DiT + Oobleck VAE) | `/action/bin/dit-vae` | +| `ace-understand` binary (reverse pipeline) | `/action/bin/ace-understand` | | `Qwen3-Embedding-0.6B-Q8_0.gguf` | `/action/models/` | | `acestep-5Hz-lm-4B-Q8_0.gguf` | `/action/models/` | | `acestep-v15-turbo-Q8_0.gguf` | `/action/models/` | | `vae-BF16.gguf` | `/action/models/` | At runtime the entrypoint (`src/entrypoint.sh`): + +**Generation mode** (default — when `understand` is not set): 1. Builds a request JSON from inputs 2. Runs `ace-qwen3` (LLM stage: caption → enriched JSON with lyrics + audio codes) 3. 
Runs `dit-vae` (DiT + VAE stage: JSON → stereo 48 kHz WAV) 4. Moves the output WAV to the requested path in `$GITHUB_WORKSPACE` +**Understand mode** (when `understand` is provided): +1. If the value starts with `http://`, `https://`, `ftp://` or `file://`, downloads the audio file with `curl`; otherwise treats the value as a local path and uses that file directly +2. Runs `ace-understand` (VAE encode → FSQ tokenize → LM understand → JSON) +3. Emits the resulting JSON as the `understand_result` action output + **Image location:** `ghcr.io/audiohacking/acestep-action:latest` ## Project structure diff --git a/action.yml b/action.yml index 91973db..ba344a5 100644 --- a/action.yml +++ b/action.yml @@ -39,12 +39,18 @@ inputs: description: 'Output path for the generated WAV file (relative to workspace or absolute)' required: false default: 'output.wav' + understand: + description: 'Local file path or URL (http/https) to an MP3 or WAV audio file to analyze with ace-understand (activates understand mode — skips audio generation)' + required: false + default: '' outputs: audio_file: description: 'Path to the generated WAV audio file' generation_time: description: 'Time taken to generate the audio in seconds' + understand_result: + description: 'JSON output from ace-understand containing caption, lyrics, bpm, key, duration, and language analysis of the provided audio' runs: using: 'docker' diff --git a/src/entrypoint.sh b/src/entrypoint.sh index 21b63a9..fc3e5bc 100644 --- a/src/entrypoint.sh +++ b/src/entrypoint.sh @@ -23,6 +23,7 @@ INFERENCE_STEPS="${INPUT_INFERENCE_STEPS:-8}" SHIFT="${INPUT_SHIFT:-3}" VOCAL_LANGUAGE="${INPUT_VOCAL_LANGUAGE:-en}" OUTPUT_PATH="${INPUT_OUTPUT_PATH:-}" +UNDERSTAND="${INPUT_UNDERSTAND:-}" # --------------------------------------------------------------------------- # Fixed in-image paths @@ -31,6 +32,7 @@ OUTPUT_PATH="${INPUT_OUTPUT_PATH:-}" MODEL_DIR="/action/models" ACE_QWEN3="/action/bin/ace-qwen3" DIT_VAE="/action/bin/dit-vae" +ACE_UNDERSTAND="/action/bin/ace-understand" # 
--------------------------------------------------------------------------- # Validate binaries @@ -45,6 +47,84 @@ if [ ! -x "$DIT_VAE" ]; then exit 1 fi +# --------------------------------------------------------------------------- +# Understand mode — download audio and run ace-understand, then exit +# --------------------------------------------------------------------------- + +if [ -n "${UNDERSTAND}" ]; then + if [ ! -x "$ACE_UNDERSTAND" ]; then + echo "Error: ace-understand binary not found at $ACE_UNDERSTAND" >&2 + exit 1 + fi + + WORK_DIR=$(mktemp -d) + trap 'rm -rf "$WORK_DIR"' EXIT + + echo "=== ace-understand mode ===" + echo "UNDERSTAND=${UNDERSTAND}" + + # Determine whether the input is a URL (requires curl) or a local path + case "${UNDERSTAND}" in + http://*|https://*|ftp://*|file://*) + # URL — keep a .wav extension for WAV URLs (with or without a query string); anything else is saved as .mp3 + case "${UNDERSTAND}" in + *.wav|*.WAV|*.wav\?*|*.WAV\?*) AUDIO_FILE="$WORK_DIR/input.wav" ;; + *) AUDIO_FILE="$WORK_DIR/input.mp3" ;; + esac + echo "" + echo "=== Downloading audio ===" + curl -fsSL --max-time 300 -o "${AUDIO_FILE}" "${UNDERSTAND}" + if [ ! -s "${AUDIO_FILE}" ]; then + echo "Error: downloaded audio file is missing or empty at ${AUDIO_FILE}" >&2 + exit 1 + fi + echo "Downloaded: $(ls -lh "${AUDIO_FILE}")" + ;; + *) + # Local path — use directly (no download needed) + AUDIO_FILE="${UNDERSTAND}" + if [ ! -f "${AUDIO_FILE}" ]; then + echo "Error: local audio file not found at ${AUDIO_FILE}" >&2 + exit 1 + fi + if [ ! 
-s "${AUDIO_FILE}" ]; then + echo "Error: local audio file is empty at ${AUDIO_FILE}" >&2 + exit 1 + fi + echo "Using local file: $(ls -lh "${AUDIO_FILE}")" + ;; + esac + + UNDERSTAND_OUTPUT="$WORK_DIR/understand_result.json" + + echo "" + echo "=== Running ace-understand ===" + "$ACE_UNDERSTAND" \ + --src-audio "${AUDIO_FILE}" \ + --dit "$MODEL_DIR/acestep-v15-turbo-Q8_0.gguf" \ + --vae "$MODEL_DIR/vae-BF16.gguf" \ + --model "$MODEL_DIR/acestep-5Hz-lm-4B-Q8_0.gguf" \ + -o "${UNDERSTAND_OUTPUT}" + + echo "" + echo "=== ace-understand result ===" + cat "${UNDERSTAND_OUTPUT}" + + # Set GitHub Actions output (multiline-safe heredoc with random delimiter) + if [ -n "${GITHUB_OUTPUT:-}" ]; then + DELIM="EOF_$$_${RANDOM}" + { + echo "understand_result<<${DELIM}" + cat "${UNDERSTAND_OUTPUT}" + echo "${DELIM}" + } >> "$GITHUB_OUTPUT" + fi + + echo "" + echo "=== ace-understand complete ===" + exit 0 +fi + # --------------------------------------------------------------------------- # Resolve output path: relative paths are relative to $GITHUB_WORKSPACE # --------------------------------------------------------------------------- @@ -53,7 +133,7 @@ WORKSPACE_ROOT="${GITHUB_WORKSPACE:-/github/workspace}" # Default to workspace root if not specified if [ -z "$OUTPUT_PATH" ]; then - OUTPUT_PATH="${WORKSPACE_ROOT}/output.wav" + OUTPUT_PATH="${WORKSPACE_ROOT}/output.mp3" elif [[ "$OUTPUT_PATH" != /* ]]; then OUTPUT_PATH="${WORKSPACE_ROOT}/${OUTPUT_PATH}" fi @@ -71,6 +151,7 @@ echo "INFERENCE_STEPS=${INFERENCE_STEPS}" echo "SHIFT=${SHIFT}" echo "VOCAL_LANGUAGE=${VOCAL_LANGUAGE}" echo "OUTPUT_PATH=${OUTPUT_PATH}" +echo "UNDERSTAND=${UNDERSTAND}" echo "WORKSPACE_ROOT=${WORKSPACE_ROOT}" echo "GITHUB_WORKSPACE=${GITHUB_WORKSPACE:-}" @@ -134,7 +215,7 @@ echo "=== Stage 1: ace-qwen3 (LLM) ===" REQUEST0_FILE="${REQUEST_FILE%.json}0.json" # --------------------------------------------------------------------------- -# Stage 2 — DiT + VAE: synthesises stereo 48 kHz WAV → request00.wav +# 
Stage 2 — DiT + VAE: synthesises stereo 48 kHz audio → request00.mp3 # --------------------------------------------------------------------------- echo "" @@ -145,8 +226,8 @@ echo "=== Stage 2: dit-vae (DiT + VAE) ===" --dit "$MODEL_DIR/acestep-v15-turbo-Q8_0.gguf" \ --vae "$MODEL_DIR/vae-BF16.gguf" -# dit-vae writes requestN0.wav alongside the request0.json file -OUTPUT_WAV="${REQUEST0_FILE%.json}0.wav" +# dit-vae writes requestN0.mp3 alongside the request0.json file +OUTPUT_WAV="${REQUEST0_FILE%.json}0.mp3" echo "=== Directory listings (pre-move) ===" echo "WORK_DIR=${WORK_DIR}" @@ -191,7 +272,7 @@ fi # --------------------------------------------------------------------------- ACTIONS_WORKSPACE="/github/workspace" -ACTIONS_OUTPUT="${ACTIONS_WORKSPACE}/output.wav" +ACTIONS_OUTPUT="${ACTIONS_WORKSPACE}/output.mp3" echo "=== Directory listings (pre-copy) ===" ls -lh "$ACTIONS_WORKSPACE" || echo "(ls ${ACTIONS_WORKSPACE} failed)" @@ -228,7 +309,7 @@ fi # --------------------------------------------------------------------------- if [ -n "${GITHUB_OUTPUT:-}" ]; then - # Always point to /github/workspace/output.wav so subsequent steps can + # Always point to /github/workspace/output.mp3 so subsequent steps can # reliably access the file regardless of what output_path was specified. echo "audio_file=${ACTIONS_OUTPUT}" >> "$GITHUB_OUTPUT" echo "generation_time=${GENERATION_TIME}" >> "$GITHUB_OUTPUT"