Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
71 commits
Select commit Hold shift + click to select a range
8d0f709
feat(skills): add tts-voiceover skill for Azure Speech SDK voice-over…
auyidi1 Apr 20, 2026
24fe74b
fix(skills): address PR review findings for tts-voiceover skill
auyidi1 Apr 21, 2026
91a5233
fix(skills): resolve second PR review for tts-voiceover skill
auyidi1 Apr 21, 2026
456eaf3
style(skills): format long lines in tts-voiceover Python scripts
auyidi1 Apr 21, 2026
84f00eb
fix(skills): revert external formatter and apply ruff format
auyidi1 Apr 21, 2026
6df83d1
fix(skills): add pytest-cov, fuzz corpus, and fix plugin freshness
auyidi1 Apr 21, 2026
711f37e
fix(skills): add Copilot footer to tts-voiceover SKILL.md
auyidi1 Apr 21, 2026
22dcc91
fix(skills): pin dependency versions and fix lxml CVE-2026-41066
auyidi1 Apr 21, 2026
74838c8
fix(skills): address all PR review findings for tts-voiceover skill
auyidi1 Apr 21, 2026
af868c3
fix(ci): allow certifi and charset-normalizer licenses in dependency …
auyidi1 Apr 21, 2026
c01dbbf
fix(skills): restore required Copilot footer in tts-voiceover SKILL.md
auyidi1 Apr 21, 2026
3cc7b1e
fix(skills): address final PR review comments for tts-voiceover
auyidi1 Apr 22, 2026
482565b
feat(skills): add docs and Pester tests for tts-voiceover skill
auyidi1 Apr 23, 2026
bf3ab3d
fix(skills): address PR review comments for tts-voiceover skill
auyidi1 Apr 24, 2026
c0b4e4f
fix(skills): add script name and purpose headers to PS1 wrappers
auyidi1 Apr 24, 2026
4412f6c
fix(skills): fix markdown lint and ruff line-length violations
auyidi1 Apr 25, 2026
d432388
fix(skills): apply ruff format and table formatting for CI
auyidi1 Apr 25, 2026
2e0455b
fix(skills): add narration timing to embed_audio for PowerPoint video…
auyidi1 Apr 27, 2026
1eecef2
fix(skills): address PR review findings for tts-voiceover
Apr 29, 2026
4e7ed66
fix(collections): add missing maturity: experimental for vscode-playw…
Apr 29, 2026
2973bcd
fix(docs): use pathname:// protocol for out-of-scope SKILL.md link
Apr 29, 2026
d4c2bc6
chore(plugins): regenerate plugins after collection maturity updates
Apr 29, 2026
4479aff
fix(skills): add missing Copilot footer to tts-voiceover SKILL.md
Apr 29, 2026
b0b136e
style(docs): fix markdown table formatting in tts-voiceover guide
Apr 29, 2026
7bd1052
chore(plugins): regenerate plugin READMEs after SKILL.md footer update
Apr 29, 2026
cc54806
Merge branch 'main' into users/auyidi/tts-voiceover-skill
auyidi1 Apr 30, 2026
022c725
fix(skills): address latest PR review for tts-voiceover
Apr 30, 2026
171cb0f
test(skills): add test_embed_audio.py for tts-voiceover skill
Apr 30, 2026
7c28ef5
fix(skills): address final review items for tts-voiceover
Apr 30, 2026
d7c1b83
style(skills): rename test methods to BDD format per python-test conv…
Apr 30, 2026
e987592
fix(skills): correct buffer comment and tighten assertion in test_sho…
Apr 30, 2026
e56b4a2
fix(skills): add pytest-mock and migrate to mocker fixture
Apr 30, 2026
259a19f
style(skills): move #Requires after copyright headers per PS conventions
Apr 30, 2026
d167b6a
test(skills): add test_generate_voiceover.py for tts-voiceover skill
Apr 30, 2026
1bc89da
fix(skills): log exception type in embed_slide_audio catch block
Apr 30, 2026
c89fb96
fix(skills): remove non-standard metadata fields from SKILL.md frontm…
Apr 30, 2026
05502c5
fix(skills): return EXIT_FAILURE when audio synthesis fails for any s…
Apr 30, 2026
3761c67
fix(skills): address final review items for tts-voiceover
Apr 30, 2026
b934e49
docs(skills): add input contract and lexicon constraint to apply_acro…
Apr 30, 2026
ec423ea
fix(skills): return False when audio shape not found, add type-safe l…
Apr 30, 2026
ebdb8dd
fix(skills): move Copilot footer above attribution so attribution is …
Apr 30, 2026
db59e6d
fix(skills): XML-escape fuzz inputs, use Slide type hint, remove unus…
Apr 30, 2026
0c84590
fix(skills): add ValidateNotNullOrEmpty, fix Pester skip, cache regex
Apr 30, 2026
79b35f1
style(skills): remove non-standard module-level synopsis block from T…
Apr 30, 2026
204fdd6
fix(skills): fix sidebar_position collision and add AAA test structure
Apr 30, 2026
19f5d0c
fix(skills): wrap token refresh in try/except, fix OutputType convention
Apr 30, 2026
6086ee8
refactor(skills): co-locate Pester test inside tts-voiceover skill pa…
Apr 30, 2026
617550a
docs(skills): clarify lxml is a direct and transitive dependency
Apr 30, 2026
52899f2
fix(skills): align embed_audio exit code with generate_voiceover on p…
Apr 30, 2026
aa42509
Merge branch 'main' into users/auyidi/tts-voiceover-skill
auyidi1 May 1, 2026
ea52f30
fix(skills): address PR review feedback for tts-voiceover
May 1, 2026
528baf2
fix(skills): address additional tts-voiceover review feedback
May 1, 2026
d3a05ea
fix(skills): clean up orphaned audio shape and reorder _run before main
May 1, 2026
27b54da
fix(skills): address CodeQL finding and review feedback
May 1, 2026
35a4721
refactor(skills): capture add_movie() return value, remove _find_audi…
May 1, 2026
95a3b0a
fix(skills): address review feedback — license, types, tests, guard
May 1, 2026
92aedea
refactor(skills): extract configure_logging in embed_audio.py
May 1, 2026
f524234
fix(skills): wire verbose flag, replace assert, explicit Mandatory
May 1, 2026
0f819d5
refactor(skills): extract configure_logging and add --verbose to gene…
May 1, 2026
9fb9d1a
fix(skills): add diagnostic log when no audio files are embedded
May 2, 2026
6e31263
Merge branch 'main' into users/auyidi/tts-voiceover-skill
auyidi1 May 4, 2026
1343bc9
fix(skills): address tts-voiceover review feedback round 11
May 4, 2026
f94e8ab
fix(skills): address tts-voiceover review feedback round 12
May 4, 2026
fbb38bf
fix(skills): address tts-voiceover review feedback round 13
May 4, 2026
1363016
fix(skills): drop unused xmlns:a and set advClick=0 for audio-driven …
May 4, 2026
701d1c9
fix(skills): add word boundaries to acronym regex to prevent partial …
May 4, 2026
c892868
fix(skills): move configure_logging before _run and document --verbos…
May 4, 2026
d4a5857
fix(skills): clean up partial WAV on failure and use static timing XM…
May 5, 2026
5986338
fix(skills): add defensive warnings for missing timing template elements
May 5, 2026
9adda51
fix(skills): move timing template to module level and tighten shape_i…
May 5, 2026
f8c4ccc
fix(skills): guard credential None at token refresh and use WAV file …
May 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .cspell/general-technical.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1595,3 +1595,4 @@ LASTEXITCODE
scriptblock
DSSE
intoto
SSML
184 changes: 184 additions & 0 deletions .github/skills/experimental/tts-voiceover/SKILL.md
Comment thread
auyidi1 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
---
name: tts-voiceover
description: 'Text-to-speech voice-over generation from YAML speaker notes using Azure Speech SDK with SSML pronunciation control - Brought to you by microsoft/hve-core'
metadata:
authors: "microsoft/hve-core"
spec_version: "1.0"
---
Comment thread
auyidi1 marked this conversation as resolved.

# TTS Voice Over Skill

Generates per-slide WAV voice-over files from YAML `speaker_notes` using Azure Speech SDK with SSML pronunciation control.

## Overview

This skill reads `content.yaml` files from a PowerPoint skill content directory, extracts `speaker_notes` fields, applies SSML acronym aliases for correct pronunciation of technical terms, and produces one WAV file per slide. It also supports a dry-run mode that prints the SSML templates for verification without requiring Azure credentials.

## Prerequisites

* **Azure Speech resource** — Free tier provides 500K characters per month.
* **Authentication** — Key-based (`SPEECH_KEY`) or Microsoft Entra ID (`SPEECH_RESOURCE_ID`).
* **Python 3.11+** with `uv` for virtual environment management.

### Key-Based Auth

```bash
export SPEECH_KEY="your-speech-key"
export SPEECH_REGION="eastus"
```

### Microsoft Entra ID Auth

Requires a custom domain on the Speech resource and the `Cognitive Services Speech User` role.

```bash
export SPEECH_RESOURCE_ID="/subscriptions/.../Microsoft.CognitiveServices/accounts/your-resource"
export SPEECH_REGION="eastus"
```

Install dependencies:

```bash
# run from this skill folder
uv sync
```

## Quick Start

Verify SSML templates without generating audio:

```bash
uv run scripts/generate_voiceover.py --dry-run --content-dir path/to/content
```

Generate voice-over WAV files:

```bash
uv run scripts/generate_voiceover.py --content-dir path/to/content --output-dir voice-over
```

Embed audio into a PPTX deck:

```bash
uv run scripts/embed_audio.py --input deck.pptx --audio-dir voice-over --output deck-narrated.pptx
```

## Parameters Reference

### generate_voiceover.py

| Parameter          | Type   | Default                             | Description                                   |
|:-------------------|:-------|:------------------------------------|:----------------------------------------------|
| `--dry-run`        | flag   | `false`                             | Print SSML templates without generating audio |
| `--voice`          | string | `en-US-Andrew:DragonHDLatestNeural` | Azure TTS voice name                          |
| `--rate`           | string | `+10%`                              | Speech prosody rate                           |
| `--content-dir`    | path   | `content`                           | Path to slide content directory               |
| `--output-dir`     | path   | `voice-over`                        | Path to WAV output directory                  |
| `--lexicon`        | path   | *(auto-detect)*                     | Custom acronyms.yaml path                     |
| `--verbose` / `-v` | flag   | `false`                             | Enable verbose (DEBUG) logging output         |

### embed_audio.py

Embeds WAV files into corresponding PPTX slides and adds narration timing
XML so PowerPoint recognizes the audio for video export via
**File > Export > Create a Video > Use Recorded Timings and Narrations**.

| Parameter | Type | Default | Description |
|:--------------|:-----|:------------------|:-----------------------------|
| `--input` | path | *(required)* | Source PPTX file path |
| `--audio-dir` | path | `voice-over` | Directory with slide-NNN.wav |
| `--output` | path | `*-narrated.pptx` | Output PPTX file path |
| `--verbose` / `-v` | flag | `false` | Enable verbose (DEBUG) logging output |

## Script Reference
Comment thread
auyidi1 marked this conversation as resolved.

Generate with custom voice and rate:

```bash
uv run scripts/generate_voiceover.py \
--content-dir content \
--output-dir voice-over \
--voice "en-US-Jenny:DragonHDLatestNeural" \
--rate "+5%"
```

Use a custom lexicon:

```bash
uv run scripts/generate_voiceover.py \
--content-dir content \
--lexicon custom-acronyms.yaml
```

Embed generated audio:

```bash
uv run scripts/embed_audio.py \
--input slide-deck/presentation.pptx \
--audio-dir voice-over \
--output slide-deck/presentation-narrated.pptx
```

## Acronym Lexicon

The lexicon controls SSML `<sub alias>` replacements for acronyms and technical terms. Create an `acronyms.yaml` file:

```yaml
acronyms:
HVE-Core: "H V E Core"
OWASP: "Oh wasp"
SBOM: "S Bomb"
SLSA: "Salsa"
CI/CD: "C I C D"
```

Lexicon resolution order:

1. Path specified via `--lexicon` argument.
2. `acronyms.yaml` in the content directory.
3. Built-in defaults covering common technical acronyms.

## SSML Template

Each slide produces an SSML document:

```xml
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="en-US">
<voice name="en-US-Andrew:DragonHDLatestNeural">
<prosody rate="+10%">
Text with <sub alias="Oh wasp">OWASP</sub> aliases applied.
</prosody>
</voice>
</speak>
```

## Integration with PowerPoint Skill

This skill reads from the PowerPoint skill's content directory structure:

```text
content/
├── slide-001/
│ └── content.yaml # Must include speaker_notes: field
├── slide-002/
│ └── content.yaml
└── ...
```

Each `content.yaml` should contain a `speaker_notes:` field with the narration text. The generated WAV files are named `slide-NNN.wav` matching the directory names.

## Troubleshooting

| Issue | Solution |
|:-----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------|
| `Set SPEECH_KEY ... or SPEECH_RESOURCE_ID` | Export `SPEECH_KEY` (key auth) or `SPEECH_RESOURCE_ID` (Entra ID) with `SPEECH_REGION`. |
| 401 with Entra ID auth | Verify custom domain on the Speech resource and `Cognitive Services Speech User` role. RBAC propagation takes up to 5 minutes. |
| Empty WAV files or skipped slides | Verify `speaker_notes:` is present and non-empty in `content.yaml`. |
| Mispronounced acronyms | Add entries to `acronyms.yaml` with phonetic aliases. |
| `azure-cognitiveservices-speech package is required` | Run `uv sync` in the skill directory. |
| Audio icon visible in PPTX | Reposition or resize the audio object in PowerPoint after embedding. |
| Authored slide animations missing after embedding | `embed_audio.py` replaces existing `p:timing` with narration timing; re-apply animations in PowerPoint after embedding audio. |
| Video export shows "No timings recorded" | Re-embed audio with the updated `embed_audio.py`, which adds narration timing XML automatically. |

> Brought to you by microsoft/hve-core
34 changes: 34 additions & 0 deletions .github/skills/experimental/tts-voiceover/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
[project]
name = "tts-voiceover-skill"
version = "0.0.0"
requires-python = ">=3.11"
Comment thread
auyidi1 marked this conversation as resolved.
dependencies = [
"azure-cognitiveservices-speech>=1.41",
"azure-identity>=1.19",
"lxml>=6.1.0", # direct dep (embed_audio.py) and transitive via python-pptx; explicit pin ensures CVE patches
"python-pptx>=1.0",
"pyyaml>=6.0",
]
Comment thread
auyidi1 marked this conversation as resolved.

[dependency-groups]
dev = [
"pytest>=9.0",
"pytest-cov>=5.0",
"pytest-mock>=3.14",
Comment thread
auyidi1 marked this conversation as resolved.
"ruff>=0.15",
]
Comment thread
auyidi1 marked this conversation as resolved.
fuzz = [
"atheris>=3.0",
]

[tool.pytest.ini_options]
testpaths = ["tests"]
pythonpath = ["scripts"]
python_files = ["test_*.py", "fuzz_harness.py"]

[tool.ruff]
line-length = 88
target-version = "py311"

[tool.ruff.lint]
select = ["E", "F", "I", "W"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env pwsh
Comment thread
auyidi1 marked this conversation as resolved.
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: MIT
#Requires -Version 7.0
#
# Invoke-EmbedAudio.ps1
#
# Purpose: Wrapper that manages uv venv setup and delegates to embed_audio.py

<#
.SYNOPSIS
Embeds per-slide WAV voice-over files into a PowerPoint deck.

.DESCRIPTION
Manages the Python virtual environment and invokes embed_audio.py to add
WAV files as embedded media objects in the corresponding slides of a PPTX file.

.PARAMETER InputPath
Source PPTX file path. Required.

.PARAMETER AudioDir
Directory containing slide-NNN.wav files. Defaults to voice-over.

.PARAMETER OutputPath
Output PPTX file path. Defaults to input stem + '-narrated.pptx'.

.PARAMETER SkipVenvSetup
Skip virtual environment creation and dependency installation.

.EXAMPLE
./Invoke-EmbedAudio.ps1 -InputPath deck.pptx -AudioDir voice-over

.EXAMPLE
./Invoke-EmbedAudio.ps1 -InputPath deck.pptx -AudioDir voice-over -OutputPath deck-narrated.pptx

.NOTES
Part of the tts-voiceover skill. Manages uv virtual environment setup
and delegates to embed_audio.py for WAV embedding into PPTX slides.
#>
Comment thread
auyidi1 marked this conversation as resolved.

[CmdletBinding()]
param(
    # Source PPTX file; forwarded to embed_audio.py as --input.
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string]$InputPath,

    # Directory of WAV files; forwarded as --audio-dir only when a value is supplied.
    [Parameter(Mandatory = $false)]
    [string]$AudioDir,

    # Destination PPTX file; forwarded as --output only when a value is supplied.
    [Parameter(Mandatory = $false)]
    [string]$OutputPath,

    # When set, the Initialize-PythonEnvironment step is skipped.
    [Parameter(Mandatory = $false)]
    [switch]$SkipVenvSetup
)

# Treat every error as terminating so a failed step aborts the wrapper.
$ErrorActionPreference = 'Stop'

# Path anchors: this script's directory, its parent directory (used as the
# skill root), and the '.venv' directory beneath that root.
$ScriptDir = $PSScriptRoot
$SkillRoot = Split-Path -Path $ScriptDir -Parent
$VenvDir = Join-Path -Path $SkillRoot -ChildPath '.venv'

# Load the helper module that sits in the Modules folder next to this script.
# -Force re-imports it even if a copy is already loaded in the session.
Import-Module -Name (Join-Path -Path $ScriptDir -ChildPath 'Modules/TtsVoiceoverHelpers.psm1') -Force

#region Main

Comment thread
auyidi1 marked this conversation as resolved.
# Run the workflow only when the script is executed directly. When the file is
# dot-sourced (InvocationName is '.'), nothing inside this block runs.
if ($MyInvocation.InvocationName -ne '.') {

    # Test-UvAvailability, Initialize-PythonEnvironment and Get-VenvPythonPath
    # are not defined in this script; presumably they come from the imported
    # TtsVoiceoverHelpers module. The availability result is discarded on purpose.
    $null = Test-UvAvailability

    if (-not $SkipVenvSetup) {
        Initialize-PythonEnvironment -SkillRoot $SkillRoot
    }

    $python = Get-VenvPythonPath -VenvDir $VenvDir

    # -LiteralPath: the interpreter path is a concrete location, not a pattern.
    # Without it, wildcard characters such as '[' or ']' in a parent directory
    # name make Test-Path report an existing interpreter as missing.
    # -PathType Leaf: a directory at that location is not a usable interpreter.
    if (-not (Test-Path -LiteralPath $python -PathType Leaf)) {
        throw "Python not found at $python. Run without -SkipVenvSetup to initialize."
    }

    $script = Join-Path $ScriptDir 'embed_audio.py'

    # --input is always passed; optional arguments are appended only when the
    # caller supplied a non-empty value.
    $PythonArgs = @('--input', $InputPath)

    if ($AudioDir) { $PythonArgs += '--audio-dir', $AudioDir }
    if ($OutputPath) { $PythonArgs += '--output', $OutputPath }

    # Forward --verbose whenever the verbose preference is anything other than
    # SilentlyContinue.
    if ($VerbosePreference -ne 'SilentlyContinue') { $PythonArgs += '--verbose' }

    & $python $script @PythonArgs

    # A native command's non-zero exit code does not raise an error by itself,
    # so convert it into a terminating error for callers.
    if ($LASTEXITCODE -ne 0) {
        throw "embed_audio.py exited with code $LASTEXITCODE"
    }

}

#endregion Main
Loading
Loading