diff --git a/.github/workflows/skyeye.yaml b/.github/workflows/skyeye.yaml index 266efa44..dfb9fa8f 100644 --- a/.github/workflows/skyeye.yaml +++ b/.github/workflows/skyeye.yaml @@ -33,6 +33,26 @@ jobs: uses: ./.github/actions/setup - name: Test run: make test + integration-test: + name: Integration Test (Advisory) + needs: [lint, test] + runs-on: ubuntu-latest + timeout-minutes: 60 + continue-on-error: true + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup + uses: ./.github/actions/setup + - name: Cache AI models + uses: actions/cache@v4 + with: + path: models + key: ai-models-${{ hashFiles('pkg/recognizer/parakeet/model/version.go', 'pkg/synthesizer/pocket/model/version.go') }} + - name: Download models + run: CGO_ENABLED=0 go run ./cmd/download-models --dir models + - name: Integration Test + run: make integration-test build-linux-amd64: name: Build on Linux AMD64 runs-on: ubuntu-latest @@ -57,9 +77,17 @@ jobs: cp LICENSE dist/skyeye-linux-amd64/LICENSE cp config.yaml dist/skyeye-linux-amd64/config.yaml cp docs/*.md dist/skyeye-linux-amd64/docs/ + - name: Cache AI models + if: startsWith(github.ref, 'refs/tags/') + uses: actions/cache@v4 + with: + path: models + key: ai-models-${{ hashFiles('pkg/recognizer/parakeet/model/version.go', 'pkg/synthesizer/pocket/model/version.go') }} - name: Download models if: startsWith(github.ref, 'refs/tags/') - run: CGO_ENABLED=0 go run ./cmd/download-models --dir dist/skyeye-linux-amd64/models/parakeet + run: | + CGO_ENABLED=0 go run ./cmd/download-models --dir models + cp -r models dist/skyeye-linux-amd64/models - name: Create dist archive shell: bash run: tar -czf dist/skyeye-linux-amd64.tar.gz -C dist skyeye-linux-amd64 @@ -96,9 +124,17 @@ jobs: cp LICENSE dist/skyeye-macos-arm64/LICENSE cp config.yaml dist/skyeye-macos-arm64/config.yaml cp docs/*.md dist/skyeye-macos-arm64/docs/ + - name: Cache AI models + if: startsWith(github.ref, 'refs/tags/') + uses: actions/cache@v4 + with: + path: models + key: 
ai-models-${{ hashFiles('pkg/recognizer/parakeet/model/version.go', 'pkg/synthesizer/pocket/model/version.go') }} - name: Download models if: startsWith(github.ref, 'refs/tags/') - run: CGO_ENABLED=0 go run ./cmd/download-models --dir dist/skyeye-macos-arm64/models/parakeet + run: | + CGO_ENABLED=0 go run ./cmd/download-models --dir models + cp -r models dist/skyeye-macos-arm64/models - name: Create dist archive shell: bash run: tar -czf dist/skyeye-macos-arm64.tar.gz -C dist skyeye-macos-arm64 @@ -158,10 +194,18 @@ jobs: cp init/winsw/skyeye-service.yml dist/skyeye-windows-amd64/skyeye-service.yml cp winsw.exe dist/skyeye-windows-amd64/skyeye-scaler-service.exe cp init/winsw/skyeye-scaler-service.yml dist/skyeye-windows-amd64/skyeye-scaler-service.yml + - name: Cache AI models + if: startsWith(github.ref, 'refs/tags/') + uses: actions/cache@v4 + with: + path: models + key: ai-models-${{ hashFiles('pkg/recognizer/parakeet/model/version.go', 'pkg/synthesizer/pocket/model/version.go') }} - name: Download models if: startsWith(github.ref, 'refs/tags/') shell: msys2 {0} - run: CGO_ENABLED=0 go run ./cmd/download-models --dir dist/skyeye-windows-amd64/models/parakeet + run: | + CGO_ENABLED=0 go run ./cmd/download-models --dir models + cp -r models dist/skyeye-windows-amd64/models - name: Create dist archive shell: msys2 {0} run: | diff --git a/CLAUDE.md b/CLAUDE.md index 435335b5..3def3907 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,21 +2,21 @@ SkyEye is an AI-powered GCI bot for DCS World that uses Parakeet TDT speech recognition (via sherpa-onnx), Tacview telemetry, and TTS to replace in-game AWACS with natural language command processing following real-world aviation brevity codes. 
-**Stack:** Go 1.26 + CGO, sherpa-onnx (Parakeet TDT), Piper TTS (Windows/Linux), macOS Speech Synthesis, Tacview ACMI, SRS protocol +**Stack:** Go 1.26 + CGO, sherpa-onnx (Parakeet TDT + Pocket TTS), Tacview ACMI, SRS protocol ## Platform Support | Platform | Arch | Status | TTS | Linking | Runtime Deps | |----------|------|--------|-----|---------|--------------| -| **Windows** | AMD64 | ✅ | Piper (embedded) | Static | None - fully portable exe | -| **Linux** | AMD64 | ✅ | Piper (embedded) | Dynamic opus/soxr | libopus0, libsoxr0 | -| **macOS** | ARM64 | ✅ | System (Neural Engine) | Dynamic opus/soxr | Homebrew opus, libsoxr | +| **Windows** | AMD64 | ✅ | Pocket TTS (sherpa-onnx) | Static | None - fully portable exe | +| **Linux** | AMD64 | ✅ | Pocket TTS (sherpa-onnx) | Dynamic opus/soxr | libopus0, libsoxr0 | +| **macOS** | ARM64 | ✅ | Pocket TTS (sherpa-onnx) | Dynamic opus/soxr | Homebrew opus, libsoxr | | macOS Intel | AMD64 | ❌ | - | - | No test hardware | **Key Differences:** - **Windows:** MUST build in MSYS2 UCRT64 (not cmd/PowerShell), static linking, portable binary - **Linux:** Standard Unix build, requires runtime libraries, good for containers -- **macOS:** Uses Apple Clang (system compiler), `--use-system-voice` flag available +- **macOS:** Uses Apple Clang (system compiler) - **Cross-compilation:** Not supported - must build on target platform ## Critical: Use Make, Not Go Commands @@ -52,18 +52,19 @@ pkg/ - Public APIs recognizer/ - Speech recognition (Parakeet TDT via sherpa-onnx) recognizer/model/ - Embedded model files (encoder/decoder/joiner ONNX + tokens.txt) simpleradio/ - SRS protocol client - synthesizer/speakers/ - Platform-specific TTS (macos.go, piper.go) + synthesizer/pocket/ - Pocket TTS speaker (sherpa-onnx) + synthesizer/pocket/model/ - TTS model download/verify (no CGO) + synthesizer/pocket/voice/ - Embedded default reference voice (no CGO) + synthesizer/speakers/ - Speaker interface + resampling helpers tacview/ - Telemetry 
parsing brevity/, parser/, composer/ - GCI command handling internal/ - Private packages - application/ - Platform detection & glue + application/ - Application glue controller/, radar/, conf/ - Core logic ``` **Architecture:** Players → SRS → simpleradio.Client → recognizer → parser → controller ← radar ← tacview ← DCS -controller → composer → synthesizer (platform-specific) → simpleradio.Client → SRS - -Platform-specific code isolated to `pkg/synthesizer/speakers/{macos,piper}.go` and Makefile. Runtime detection: `runtime.GOOS` ("darwin"/"windows"/"linux"). +controller → composer → synthesizer → simpleradio.Client → SRS ## Common Pitfalls diff --git a/Makefile b/Makefile index 087438cc..eedd4492 100644 --- a/Makefile +++ b/Makefile @@ -143,6 +143,10 @@ run: test: generate $(BUILD_VARS) $(GO) tool gotestsum -- $(BUILD_FLAGS) $(TEST_FLAGS) ./... +.PHONY: integration-test +integration-test: generate download-models + SKYEYE_MODELS_PATH=$(CURDIR)/models $(BUILD_VARS) $(GO) tool gotestsum -- -tags 'nolibopusfile integration' -ldflags '$(LDFLAGS)' -timeout 45m $(TEST_FLAGS) ./... + .PHONY: benchmark-parakeet benchmark-parakeet: $(BUILD_VARS) $(GO) test -bench=. -run BenchmarkParakeetRecognizer ./pkg/recognizer/parakeet diff --git a/cmd/download-models/main.go b/cmd/download-models/main.go index d1c429d6..738eca1a 100644 --- a/cmd/download-models/main.go +++ b/cmd/download-models/main.go @@ -1,4 +1,4 @@ -// download-models downloads Parakeet TDT model files for bundling into release archives. +// download-models downloads model files for bundling into release archives. // This tool has no CGO dependencies and can be built with CGO_ENABLED=0. 
package main @@ -10,17 +10,34 @@ import ( "os/signal" "path/filepath" - "github.com/dharmab/skyeye/pkg/recognizer/parakeet/model" + parakeetmodel "github.com/dharmab/skyeye/pkg/recognizer/parakeet/model" + pocketmodel "github.com/dharmab/skyeye/pkg/synthesizer/pocket/model" ) func main() { - dir := flag.String("dir", filepath.Join("models", model.DirName), "directory to download model files into") + dir := flag.String("dir", "models", "base directory to download model files into") flag.Parse() ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() - if err := model.Download(ctx, *dir); err != nil { - log.Fatal(err) + parakeetDir := filepath.Join(*dir, parakeetmodel.DirName) + if err := parakeetmodel.Verify(parakeetDir); err != nil { + log.Printf("Parakeet model needs download: %v", err) + if err := parakeetmodel.Download(ctx, parakeetDir); err != nil { + log.Fatal(err) + } + } else { + log.Println("Parakeet model already present and verified") + } + + pocketDir := filepath.Join(*dir, pocketmodel.DirName) + if err := pocketmodel.Verify(pocketDir); err != nil { + log.Printf("Pocket TTS model needs download: %v", err) + if err := pocketmodel.Download(ctx, pocketDir); err != nil { + log.Fatal(err) + } + } else { + log.Println("Pocket TTS model already present and verified") } } diff --git a/cmd/skyeye/main.go b/cmd/skyeye/main.go index 9c0df303..8555366d 100644 --- a/cmd/skyeye/main.go +++ b/cmd/skyeye/main.go @@ -9,8 +9,6 @@ import ( "os" "os/signal" "path/filepath" - "reflect" - "runtime" "runtime/pprof" "strings" "sync" @@ -29,7 +27,7 @@ import ( "github.com/dharmab/skyeye/internal/conf" "github.com/dharmab/skyeye/pkg/coalitions" parakeetmodel "github.com/dharmab/skyeye/pkg/recognizer/parakeet/model" - "github.com/dharmab/skyeye/pkg/synthesizer/voices" + pocketmodel "github.com/dharmab/skyeye/pkg/synthesizer/pocket/model" ) // Used for CLI configuration values. 
@@ -54,12 +52,9 @@ var ( coalitionName string telemetryUpdateInterval time.Duration recognizerLockPath string - voiceName string - useSystemVoice bool + voiceFile string mute bool - voiceSpeed float64 voiceVolume float64 - voicePauseLength time.Duration voiceLockPath string enableAutomaticPicture bool automaticPictureInterval time.Duration @@ -127,20 +122,10 @@ func init() { skyeye.Flags().StringVar(&recognizerLockPath, "recognizer-lock-path", "", "Path to lock file for concurrent speech-to-text when using multiple instances") // Text-to-speech - voiceFlag := cli.NewEnum(&voiceName, "Voice", "", "feminine", "masculine") - skyeye.Flags().Var(voiceFlag, "voice", "Voice to use for SRS transmissions (feminine, masculine). Automatically chosen if not provided.") - skyeye.Flags().Float64Var(&voiceSpeed, "voice-playback-speed", 1.0, "How quickly the GCI speaks (values below 1.0 are faster and above are slower).") + skyeye.Flags().StringVar(&voiceFile, "voice-file", "", "Path to WAV file for custom voice cloning. Uses built-in default if not set.") skyeye.Flags().Float64Var(&voiceVolume, "voice-volume", voiceVolumeDefault, fmt.Sprintf("Volume level for audio output (%v = silent, %v = normal)", voiceVolumeMin, voiceVolumeDefault)) skyeye.Flags().BoolVar(&mute, "mute", false, "Mute all SRS transmissions. 
Useful for testing without disrupting play") skyeye.Flags().StringVar(&voiceLockPath, "voice-lock-path", "", "Path to lock file for concurrent text-to-speech when using multiple instances") - if runtime.GOOS == "darwin" { - skyeye.Flags().BoolVar(&useSystemVoice, "use-system-voice", false, "Use the System Voice chosen in the Spoken Content page in System Settings instead of Samantha.") - if err := skyeye.Flags().MarkDeprecated("voice", "Select a voice in System Settings and use --use-system-voice instead."); err != nil { - log.Fatal().Err(err).Msg("failed to mark flag as deprecated") - } - } else { - skyeye.Flags().DurationVar(&voicePauseLength, "voice-playback-pause", 200*time.Millisecond, "How long the GCI pauses between sentences.") - } // Controller behavior skyeye.Flags().BoolVar(&enableAutomaticPicture, "auto-picture", true, "Enable automatic PICTURE broadcasts") @@ -258,22 +243,6 @@ func randomizer() (rando *rand.Rand) { return } -func loadVoice(rando *rand.Rand) (voice voices.Voice) { - options := map[string]voices.Voice{ - "feminine": voices.FeminineVoice, - "masculine": voices.MasculineVoice, - } - if voiceName == "" { - keys := reflect.ValueOf(options).MapKeys() - voice = options[keys[rando.IntN(len(keys))].String()] - log.Info().Type("voice", voice).Msg("randomly selected voice") - } else { - voice = options[voiceName] - log.Info().Type("voice", voice).Msg("selected voice") - } - return -} - func loadCallsign(rando *rand.Rand) (callsign string) { var options []string if controllerCallsign != "" { @@ -309,28 +278,59 @@ func loadVoiceVolume() float64 { return clamped } -func setupParakeetModel(ctx context.Context, parakeetDir string, downloadModels bool) { - log.Info().Msg("verifying Parakeet model files") - if err := parakeetmodel.Verify(parakeetDir); err != nil { - var corruptErr *parakeetmodel.CorruptFileError - if errors.As(err, &corruptErr) { - log.Fatal().Err(err).Msg("Parakeet model files on disk failed verification") - } - var notFoundErr 
*parakeetmodel.FileNotFoundError - if errors.As(err, &notFoundErr) { - log.Warn().Err(err).Msg("Parakeet model files not found") - if downloadModels { - log.Info().Msg("downloading Parakeet model files") - if err := parakeetmodel.Download(ctx, parakeetDir); err != nil { - log.Fatal().Err(err).Msg("failed to download Parakeet model") - } - } else { - log.Fatal().Err(err).Msg("no Parakeet model files found") +// modelSetup holds the verify and download functions for a model, allowing +// setupModel to work with both Parakeet and Pocket TTS models. +type modelSetup struct { + name string + dir string + verify func(string) error + download func(context.Context, string) error +} + +func setupModel(ctx context.Context, m modelSetup, autoDownload bool) { + log.Info().Msgf("verifying %s model files", m.name) + err := m.verify(m.dir) + if err == nil { + log.Info().Msgf("%s model files verified", m.name) + return + } + + // Check for corrupt files first — these should not be silently re-downloaded. + if hasCorruptFile(err) { + log.Fatal().Err(err).Msgf("%s model files on disk failed verification", m.name) + } + + // Check for missing files. + if hasMissingFile(err) { + log.Warn().Err(err).Msgf("%s model files not found", m.name) + if autoDownload { + log.Info().Msgf("downloading %s model files", m.name) + if dlErr := m.download(ctx, m.dir); dlErr != nil { + log.Fatal().Err(dlErr).Msgf("failed to download %s model", m.name) } + return } - } else { - log.Info().Msg("Parakeet model files verified") + log.Fatal().Err(err).Msgf("no %s model files found", m.name) } + + // Unexpected error (e.g. permission denied). + log.Fatal().Err(err).Msgf("failed to verify %s model files", m.name) +} + +// hasCorruptFile checks whether err (possibly a joined error) contains a CorruptFileError +// from either the parakeet or pocket model packages. 
+func hasCorruptFile(err error) bool { + var parakeetCorrupt *parakeetmodel.CorruptFileError + var pocketCorrupt *pocketmodel.CorruptFileError + return errors.As(err, &parakeetCorrupt) || errors.As(err, &pocketCorrupt) +} + +// hasMissingFile checks whether err (possibly a joined error) contains a FileNotFoundError +// from either the parakeet or pocket model packages. +func hasMissingFile(err error) bool { + var parakeetNotFound *parakeetmodel.FileNotFoundError + var pocketNotFound *pocketmodel.FileNotFoundError + return errors.As(err, &parakeetNotFound) || errors.As(err, &pocketNotFound) } func preRun(cmd *cobra.Command, _ []string) error { @@ -371,12 +371,24 @@ func run(_ *cobra.Command, _ []string) { }() parakeetDir := filepath.Join(modelsPath, parakeetmodel.DirName) - setupParakeetModel(ctx, parakeetDir, downloadModels) + setupModel(ctx, modelSetup{ + name: "Parakeet", + dir: parakeetDir, + verify: parakeetmodel.Verify, + download: parakeetmodel.Download, + }, downloadModels) + + pocketDir := filepath.Join(modelsPath, pocketmodel.DirName) + setupModel(ctx, modelSetup{ + name: "Pocket TTS", + dir: pocketDir, + verify: pocketmodel.Verify, + download: pocketmodel.Download, + }, downloadModels) log.Info().Msg("loading configuration") coalition := loadCoalition() rando := randomizer() - voice := loadVoice(rando) callsign := loadCallsign(rando) parsedSRSFrequencies := cli.LoadFrequencies(srsFrequencies) voiceLock := loadLock(voiceLockPath) @@ -399,13 +411,10 @@ func run(_ *cobra.Command, _ []string) { Coalition: coalition, RadarSweepInterval: telemetryUpdateInterval, RecognizerLock: recognizerLock, - Voice: voice, - UseSystemVoice: useSystemVoice, + VoiceFile: voiceFile, VoiceLock: voiceLock, Mute: mute, - VoiceSpeed: voiceSpeed, Volume: volume, - VoicePauseLength: voicePauseLength, EnableAutomaticPicture: enableAutomaticPicture, PictureBroadcastInterval: automaticPictureInterval, EnableThreatMonitoring: enableThreatMonitoring, @@ -428,6 +437,7 @@ func run(_ *cobra.Command, 
_ []string) { if err != nil { log.Fatal().Err(err).Msg("failed to start application") } + defer app.Close() err = app.Run(ctx, cancel, &wg) if err != nil { log.Fatal().Err(err).Msg("application exited with error") diff --git a/docs/ADMIN.md b/docs/ADMIN.md index 86d13ce5..1fc155e7 100644 --- a/docs/ADMIN.md +++ b/docs/ADMIN.md @@ -124,66 +124,9 @@ SkyEye uses NVIDIA Parakeet TDT for speech recognition. The model is embedded in ## Speech Synthesis -### Windows and Linux +SkyEye uses Pocket TTS, a voice-cloning text-to-speech model, via sherpa-onnx. The TTS model files are automatically downloaded on first run (same as the Parakeet speech recognition model). -SkyEye bundles two AI generated voices on Windows and Linux: - -* An Irish English feminine voice, based on ["Jenny" by Dioco](https://github.com/dioco-group/jenny-tts-dataset) -* A British English masculine voice, based on [Alan Pope](https://popey.com/blog/) - -You can select between these voices using the `voice` configuration option. If you do not select a voice, the two voices are rotated based on the wall clock time when SkyEye is started. - -### macOS - -SkyEye uses AI generated voices built into macOS. - -By default, the "Samantha" voice is used. This is the version of Siri's voice from the iPhone 4s, iPhone 5 and iPhone 6, based on [Susan Bennett](https://susancbennett.com/). - -It is also possible to use one of the newer Siri voices. **I strongly recommend enabling one of the newer voices.**, because they provide excellent quality, nearly indistinguishable from a human voice. - -Not all Siri voices work equally well; many struggle to pronounce aviation terminology. I've manually validated a voice for each version of macOS: - -#### macOS 26 Tahoe - -On macOS 26 Tahoe, the best voice is **Siri Voice 2**. - -1. Open System Settings -2. Click on "Accessibility" -3. Click on "Siri" -4. If the system language is not English, set the system speech language to English -5. 
Next to "System Voice", click the "i" button -6. In the list of languages, make sure "English" is selected -7. Click on "Voice" -8. Scroll down to "Siri". -9. Download Siri Voice 2. -10. Click "Done" -11. Set the system voice to Siri Voice 2. - -#### macOS 15 Tahoe - -On macOS 15 Sequoia, the best voice is **Siri Voice 5**. - -1. Open System Settings -2. Click on "Accessibility" -3. Click on "Spoken Content" -4. If the system language is not English, set the system speech language to English -5. Next to "System Voice", click the "i" button -6. In the list of languages, make sure "English" is selected -7. Click on "Voice" -8. Scroll down to "Siri". -9. Download the English (United States) Siri Voice 5. -10. Click "Done" -11. Set the system voice to Siri Voice 5. - -#### Testing the System Voice - -To test your change, open Terminal and run this command: - -```sh -say "Hello! This should be read in the voice you chose." -``` - -Finally, to use the selected voice instead of Samantha, set SkyEye's `use-system-voice` configuration option to `true`. +By default, SkyEye uses an embedded reference voice. You can provide your own reference voice for voice cloning using the `voice-file` configuration option. The file must be a 16-bit PCM mono WAV file containing a few seconds of clear speech. 
## Networking @@ -473,12 +416,6 @@ Configure SkyEye by editing the config file at `$(brew --prefix)/etc/skyeye/conf $EDITOR "$(brew --prefix)/etc/skyeye/config.yaml" ``` -It is strongly recommended to configure the system voice as documented in [Speech Synthesis section](#speech-synthesis), and configure SkyEye to use the system voice: - -```yaml -use-system-voice: true -``` - To start SkyEye, and automatically start it on login: ```sh diff --git a/go.mod b/go.mod index ae4b46ad..d1e9ff96 100644 --- a/go.mod +++ b/go.mod @@ -4,14 +4,10 @@ go 1.26.0 require ( github.com/DCS-gRPC/go-bindings v0.7.1 - github.com/amitybell/piper-asset v0.0.0-20231030194325-d36a29e3b1fd - github.com/amitybell/piper-voice-alan v0.0.0-20231118093148-059963c24dbd - github.com/amitybell/piper-voice-jenny v0.0.0-20231118093224-dcf0d49e46b7 github.com/bwmarrin/discordgo v0.28.1 github.com/dharmab/goacmi v1.0.3 github.com/dharmab/numwords v1.0.1 github.com/gammazero/deque v0.2.1 - github.com/go-audio/aiff v1.1.0 github.com/gofrs/flock v0.13.0 github.com/gopxl/beep/v2 v2.1.1 github.com/hbollon/go-edlib v1.6.0 @@ -19,7 +15,6 @@ require ( github.com/k2-fsa/sherpa-onnx-go v1.12.24 github.com/lithammer/shortuuid/v3 v3.0.7 github.com/martinlindhe/unit v0.0.0-20230420213220-4adfd7d0a0d6 - github.com/nabbl/piper v0.0.0-20240819160100-e51f2288a5c0 github.com/pasztorpisti/go-crc v1.0.0 github.com/paulmach/orb v0.11.1 github.com/proway2/go-igrf v0.5.1 @@ -53,7 +48,6 @@ require ( github.com/Masterminds/semver/v3 v3.4.0 // indirect github.com/MirrexOne/unqueryvet v1.5.4 // indirect github.com/OpenPeeDeeP/depguard/v2 v2.2.1 // indirect - github.com/adrg/xdg v0.4.0 // indirect github.com/alecthomas/chroma/v2 v2.23.1 // indirect github.com/alecthomas/go-check-sumtype v0.3.1 // indirect github.com/alexkohler/nakedret/v2 v2.0.6 // indirect @@ -61,8 +55,6 @@ require ( github.com/alfatraining/structtag v1.0.0 // indirect github.com/alingse/asasalint v0.0.11 // indirect github.com/alingse/nilnesserr v0.2.0 // 
indirect - github.com/amitybell/piper-bin-linux v0.0.0-20231118093037-92b3de178ad8 // indirect - github.com/amitybell/piper-bin-windows v0.0.0-20231118093113-cc2cef2f6b74 // indirect github.com/ashanbrown/forbidigo/v2 v2.3.0 // indirect github.com/ashanbrown/makezero/v2 v2.1.0 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect @@ -100,7 +92,6 @@ require ( github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fzipp/gocyclo v0.6.0 // indirect github.com/ghostiam/protogetter v0.3.20 // indirect - github.com/go-audio/audio v1.0.0 // indirect github.com/go-critic/go-critic v0.14.3 // indirect github.com/go-toolsmith/astcast v1.1.0 // indirect github.com/go-toolsmith/astcopy v1.1.0 // indirect @@ -150,7 +141,6 @@ require ( github.com/karamaru-alpha/copyloopvar v1.2.2 // indirect github.com/kisielk/errcheck v1.10.0 // indirect github.com/kkHAIKE/contextcheck v1.1.6 // indirect - github.com/klauspost/compress v1.17.3 // indirect github.com/kulti/thelper v0.7.1 // indirect github.com/kunwardeep/paralleltest v1.0.15 // indirect github.com/lasiar/canonicalheader v1.1.2 // indirect @@ -178,7 +168,6 @@ require ( github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/moricho/tparallel v0.3.2 // indirect github.com/muesli/termenv v0.16.0 // indirect - github.com/nabbl/piper-bin-macos v0.0.0-20240805085459-7f1b1df8c68d // indirect github.com/nakabonne/nestif v0.3.1 // indirect github.com/nishanths/exhaustive v0.12.0 // indirect github.com/nishanths/predeclared v0.2.2 // indirect diff --git a/go.sum b/go.sum index ca5df1ab..881d4a5e 100644 --- a/go.sum +++ b/go.sum @@ -71,8 +71,6 @@ github.com/MirrexOne/unqueryvet v1.5.4 h1:38QOxShO7JmMWT+eCdDMbcUgGCOeJphVkzzRgy github.com/MirrexOne/unqueryvet v1.5.4/go.mod h1:fs9Zq6eh1LRIhsDIsxf9PONVUjYdFHdtkHIgZdJnyPU= github.com/OpenPeeDeeP/depguard/v2 v2.2.1 h1:vckeWVESWp6Qog7UZSARNqfu/cZqvki8zsuj3piCMx4= github.com/OpenPeeDeeP/depguard/v2 v2.2.1/go.mod h1:q4DKzC4UcVaAvcfd41CZh0PWpGgzrVxUYBlgKNGquUo= 
-github.com/adrg/xdg v0.4.0 h1:RzRqFcjH4nE5C6oTAxhBtoE2IRyjBSa62SCbyPidvls= -github.com/adrg/xdg v0.4.0/go.mod h1:N6ag73EX4wyxeaoeHctc1mas01KZgsj5tYiAIwqJE/E= github.com/alecthomas/assert/v2 v2.11.0 h1:2Q9r3ki8+JYXvGsDyBXwH3LcJ+WK5D0gc5E8vS6K3D0= github.com/alecthomas/assert/v2 v2.11.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k= github.com/alecthomas/chroma/v2 v2.23.1 h1:nv2AVZdTyClGbVQkIzlDm/rnhk1E9bU9nXwmZ/Vk/iY= @@ -96,16 +94,6 @@ github.com/alingse/asasalint v0.0.11 h1:SFwnQXJ49Kx/1GghOFz1XGqHYKp21Kq1nHad/0WQ github.com/alingse/asasalint v0.0.11/go.mod h1:nCaoMhw7a9kSJObvQyVzNTPBDbNpdocqrSP7t/cW5+I= github.com/alingse/nilnesserr v0.2.0 h1:raLem5KG7EFVb4UIDAXgrv3N2JIaffeKNtcEXkEWd/w= github.com/alingse/nilnesserr v0.2.0/go.mod h1:1xJPrXonEtX7wyTq8Dytns5P2hNzoWymVUIaKm4HNFg= -github.com/amitybell/piper-asset v0.0.0-20231030194325-d36a29e3b1fd h1:4MLHn2cCVhzhPLlPO6946h1S0yk3o7Ry1831DEa5EcE= -github.com/amitybell/piper-asset v0.0.0-20231030194325-d36a29e3b1fd/go.mod h1:MiDKnt4NenfcrsVxYAxQW0nu4zjFYQPjGzzLB5MvOz8= -github.com/amitybell/piper-bin-linux v0.0.0-20231118093037-92b3de178ad8 h1:ZZoEErHc7pMWVXe6sRr3FZud8lU9G3kbF2IhHB2647o= -github.com/amitybell/piper-bin-linux v0.0.0-20231118093037-92b3de178ad8/go.mod h1:dVR33O0l/AFgQNmZfywfgNZ6qlpCKPhLnn9UpeMMLdM= -github.com/amitybell/piper-bin-windows v0.0.0-20231118093113-cc2cef2f6b74 h1:T5hXX0Z2JaE5gtZ7LScjG0r0BmDk0+FWlzyZ2b1nboo= -github.com/amitybell/piper-bin-windows v0.0.0-20231118093113-cc2cef2f6b74/go.mod h1:5Ea0Pc0QdO8FeriIXcqZtHViM2fi589jtFubrjaAk6w= -github.com/amitybell/piper-voice-alan v0.0.0-20231118093148-059963c24dbd h1:DsXuiWSHsbBkVNL7cBAdXD95kNwrE0Ck05OasSeUZ4g= -github.com/amitybell/piper-voice-alan v0.0.0-20231118093148-059963c24dbd/go.mod h1:5ghO6mSctWNXfDoh3r46HQEMIcPr5DqE5TMYfp5hskY= -github.com/amitybell/piper-voice-jenny v0.0.0-20231118093224-dcf0d49e46b7 h1:GMYJcgP1OKBMBuQfP7r0aRk4PS0AaviHVTERtdt/e/o= -github.com/amitybell/piper-voice-jenny v0.0.0-20231118093224-dcf0d49e46b7/go.mod 
h1:eKG2Bo69QGTVKKKKApafZr+4v4zk40jYNijh0s8/PzU= github.com/ashanbrown/forbidigo/v2 v2.3.0 h1:OZZDOchCgsX5gvToVtEBoV2UWbFfI6RKQTir2UZzSxo= github.com/ashanbrown/forbidigo/v2 v2.3.0/go.mod h1:5p6VmsG5/1xx3E785W9fouMxIOkvY2rRV9nMdWadd6c= github.com/ashanbrown/makezero/v2 v2.1.0 h1:snuKYMbqosNokUKm+R6/+vOPs8yVAi46La7Ck6QYSaE= @@ -212,12 +200,6 @@ github.com/gammazero/deque v0.2.1 h1:qSdsbG6pgp6nL7A0+K/B7s12mcCY/5l5SIUpMOl+dC0 github.com/gammazero/deque v0.2.1/go.mod h1:LFroj8x4cMYCukHJDbxFCkT+r9AndaJnFMuZDV34tuU= github.com/ghostiam/protogetter v0.3.20 h1:oW7OPFit2FxZOpmMRPP9FffU4uUpfeE/rEdE1f+MzD0= github.com/ghostiam/protogetter v0.3.20/go.mod h1:FjIu5Yfs6FT391m+Fjp3fbAYJ6rkL/J6ySpZBfnODuI= -github.com/go-audio/aiff v1.1.0 h1:m2LYgu/2BarpF2yZnFPWtY3Tp41k0A4y51gDRZZsEuU= -github.com/go-audio/aiff v1.1.0/go.mod h1:sDik1muYvhPiccClfri0fv6U2fyH/dy4VRWmUz0cz9Q= -github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= -github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs= -github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498= -github.com/go-audio/wav v1.0.0/go.mod h1:3yoReyQOsiARkvPl3ERCi8JFjihzG6WhjYpZCf5zAWE= github.com/go-critic/go-critic v0.14.3 h1:5R1qH2iFeo4I/RJU8vTezdqs08Egi4u5p6vOESA0pog= github.com/go-critic/go-critic v0.14.3/go.mod h1:xwntfW6SYAd7h1OqDzmN6hBX/JxsEKl5up/Y2bsxgVQ= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= @@ -434,8 +416,6 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o github.com/kkHAIKE/contextcheck v1.1.6 h1:7HIyRcnyzxL9Lz06NGhiKvenXq7Zw6Q0UQu/ttjfJCE= github.com/kkHAIKE/contextcheck v1.1.6/go.mod h1:3dDbMRNBFaq8HFXWC1JyvDSPm43CmE6IuHam8Wr0rkg= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.17.3 h1:qkRjuerhUU1EmXLYGkSH6EZL+vPSxIrYjLNAK4slzwA= -github.com/klauspost/compress 
v1.17.3/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -488,7 +468,6 @@ github.com/matoous/godox v1.1.0 h1:W5mqwbyWrwZv6OQ5Z1a/DHGMOvXYCBP3+Ht7KMoJhq4= github.com/matoous/godox v1.1.0/go.mod h1:jgE/3fUXiTurkdHOLT5WEkThTSuE7yxHv5iWPa80afs= github.com/matryer/is v1.4.0 h1:sosSmIWwkYITGrxZ25ULNDeKiMNzFSr4V/eqBQP0PeE= github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU= -github.com/mattetti/audio v0.0.0-20180912171649-01576cde1f21/go.mod h1:LlQmBGkOuV/SKzEDXBPKauvN2UqCgzXO2XjecTGj40s= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= @@ -519,10 +498,6 @@ github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/nabbl/piper v0.0.0-20240819160100-e51f2288a5c0 h1:U9cqEmB3rVgMp0A8zTSP3TnlmnMPrbwWApEXObTBKCQ= -github.com/nabbl/piper v0.0.0-20240819160100-e51f2288a5c0/go.mod h1:bFBWV8PBQEC05ZPi5L4cBMBZQg5STeec4GmL6nvnTg4= -github.com/nabbl/piper-bin-macos v0.0.0-20240805085459-7f1b1df8c68d h1:I3pOmUaoFE8Lvf5j++9hc/UaXfW21zodWT1sihHY57M= -github.com/nabbl/piper-bin-macos v0.0.0-20240805085459-7f1b1df8c68d/go.mod 
h1:NIGeON0x6RckQptwA2jS7U89GcsjMXbzSBw5edGOw9A= github.com/nakabonne/nestif v0.3.1 h1:wm28nZjhQY5HyYPx+weN3Q65k6ilSBxDb8v5S81B81U= github.com/nakabonne/nestif v0.3.1/go.mod h1:9EtoZochLn5iUprVDmDjqGKPofoUEBL8U4Ngq6aY7OE= github.com/nishanths/exhaustive v0.12.0 h1:vIY9sALmw6T/yxiASewa4TQcFsVYZQQRUQJhKRf3Swg= @@ -664,7 +639,6 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= @@ -928,7 +902,6 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211019181941-9d821ace8654/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211105183446-c75c47738b0c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/internal/application/app.go b/internal/application/app.go index b19d968c..3e011a65 100644 --- 
a/internal/application/app.go +++ b/internal/application/app.go @@ -6,7 +6,6 @@ import ( "errors" "fmt" "path/filepath" - "runtime" "sync" "time" @@ -26,7 +25,8 @@ import ( "github.com/dharmab/skyeye/pkg/sim" "github.com/dharmab/skyeye/pkg/simpleradio" srs "github.com/dharmab/skyeye/pkg/simpleradio/types" - "github.com/dharmab/skyeye/pkg/synthesizer/speakers" + "github.com/dharmab/skyeye/pkg/synthesizer/pocket" + pocketmodel "github.com/dharmab/skyeye/pkg/synthesizer/pocket/model" "github.com/dharmab/skyeye/pkg/telemetry" "github.com/dharmab/skyeye/pkg/traces" "github.com/gofrs/flock" @@ -59,7 +59,7 @@ type Application struct { // composer converts responses and calls from internal representations to English brevity text composer composer.Composer // speaker provides text-to-speech synthesis - speaker speakers.Speaker + speaker *pocket.Speaker // speakerLock prevents multiple instances from running the speaker at the same time speakerLock *flock.Flock // volume is the audio output volume level @@ -185,14 +185,14 @@ func NewApplication(config conf.Configuration) (*Application, error) { responseComposer := composer.Composer{Callsign: config.Callsign} log.Info().Msg("constructing text-to-speech synthesizer") - var synthesizer speakers.Speaker - if runtime.GOOS == "darwin" { - synthesizer = speakers.NewMacOSSpeaker(config.UseSystemVoice, config.VoiceSpeed) - } else { - synthesizer, err = speakers.NewPiperSpeaker(config.Voice, config.VoiceSpeed, config.VoicePauseLength) - if err != nil { - return nil, fmt.Errorf("failed to construct application: %w", err) - } + pocketDir := filepath.Join(config.ModelsPath, pocketmodel.DirName) + var pocketOpts []pocket.Option + if config.VoiceFile != "" { + pocketOpts = append(pocketOpts, pocket.WithVoiceFile(config.VoiceFile)) + } + synthesizer, err := pocket.New(pocketDir, pocketOpts...) 
+ if err != nil { + return nil, fmt.Errorf("failed to construct application: %w", err) } tracers := make([]traces.Tracer, 0) @@ -234,6 +234,11 @@ func NewApplication(config conf.Configuration) (*Application, error) { return app, nil } +// Close releases resources held by the application. +func (a *Application) Close() { + a.speaker.Close() +} + // Run implements Application.Run. func (a *Application) Run(ctx context.Context, cancel context.CancelFunc, wg *sync.WaitGroup) error { wg.Go(func() { diff --git a/internal/conf/configuration.go b/internal/conf/configuration.go index dbd281c6..c4f3b2f4 100644 --- a/internal/conf/configuration.go +++ b/internal/conf/configuration.go @@ -5,7 +5,6 @@ import ( "github.com/dharmab/skyeye/pkg/coalitions" "github.com/dharmab/skyeye/pkg/simpleradio" - "github.com/dharmab/skyeye/pkg/synthesizer/voices" "github.com/gofrs/flock" "github.com/martinlindhe/unit" ) @@ -49,21 +48,15 @@ type Configuration struct { RadarSweepInterval time.Duration // RecognizerLock is a file-based lock to control multiple instances running the recognizer at the same time. RecognizerLock *flock.Flock - // Voice is the voice used for SRS transmissions - Voice voices.Voice - // UseSystemVoice controls whether to use the System Voice on macOS. This allows use of current Siri voices, - // but requires additional configuration in System Settings. - UseSystemVoice bool - // VoiceLock is a file-based lock to control multiple instances running Piper at the same time. + // VoiceFile is the path to a custom WAV file for voice cloning. + // If empty, the embedded default voice is used. + VoiceFile string + // VoiceLock is a file-based lock to control multiple instances running TTS at the same time. VoiceLock *flock.Flock // Mute disables SRS transmissions Mute bool - // Piper playback speed (default is 1.0) - The higher the value the slower it is. 
- VoiceSpeed float64 // Volume level for audio output (default is 1.0) Volume float64 - // Piper playback pause after every sentence in seconds (default is 0.2) - VoicePauseLength time.Duration // EnableAutomaticPicture controls whether the controller will automatically broadcast a PICTURE at regular intervals. EnableAutomaticPicture bool // PictureBroadcastInterval is the interval at which the controller will automatically broadcast a PICTURE. @@ -97,5 +90,3 @@ var DefaultCallsigns = []string{"Sky Eye", "Thunderhead", "Eagle Eye", "Ghost Ey var DefaultPictureRadius = 300 * unit.NauticalMile const DefaultMarginRadius = 3 * unit.NauticalMile - -var DefaultPlaybackSpeed = 1.0 diff --git a/pkg/parser/bogeydope_test.go b/pkg/parser/bogeydope_test.go index b8a755ac..c826647a 100644 --- a/pkg/parser/bogeydope_test.go +++ b/pkg/parser/bogeydope_test.go @@ -314,6 +314,36 @@ func TestParserBogeyDope(t *testing.T) { "BUCKDOP", "BOGGADOP", "Buck it up", + "BODIDODA", + "Bougie Dough", + "Bougie", + "Vogie Doe", + "Vogie", + "WAJIDOKE", + "Dody Dot", + "Ody Do", + "Bohy Dog", + "OG Doway", + "OG Dode Do", + "OG Dote", + "OG Doi", + "Bohi Day Doi D", + "Boti Doty", + "Bo G Doti", + "Mogi Dogie Dose", + "Ogie Doi Doge", + "Bobie Dogie Dogie", //nolint:dupword // intentional STT garble + "Boj Doy Dok", + "Vogee Dogie Dogie Dog", //nolint:dupword // intentional STT garble + "OG Day Dough", + "Budgie Doey Dog", + "Boyido De", + "Bodhi Doe", + "Bojy Dud", + "Moji Doti", + "Boy Do", + "Vojidoji", + "Vaughi Do", } for _, text := range simpleCases { tc := parserTestCase{ diff --git a/pkg/parser/callsign.go b/pkg/parser/callsign.go index f37fed3f..399a9955 100644 --- a/pkg/parser/callsign.go +++ b/pkg/parser/callsign.go @@ -11,6 +11,87 @@ const ( maxCallsignDigits = 3 ) +// digitHomophones maps common speech recognition misheard words to digits. 
+var digitHomophones = map[string]string{ + "won": "1", + "to": "2", + "too": "2", + "tu": "2", + "tutu": "22", + "free": "3", + "tree": "3", + "for": "4", + "fore": "4", + "ate": "8", + "niner": "9", +} + +// replaceDigitHomophones replaces words that are homophones of digits, +// but only when they appear in digit positions of a callsign (i.e., after +// the callsign name or mixed with actual digits). +func replaceDigitHomophones(tx string) string { + fields := strings.Fields(tx) + // Find the first field that is or looks like a digit. + // Everything before that is the callsign name. + firstDigitIdx := -1 + for i, f := range fields { + if hasDigits(f) || digitHomophones[f] != "" { + firstDigitIdx = i + break + } + } + if firstDigitIdx < 0 { + return tx + } + for i := firstDigitIdx; i < len(fields); i++ { + if d, ok := digitHomophones[fields[i]]; ok { + fields[i] = d + } + // Strip ordinal suffixes: "1st" → "1", "2nd" → "2", etc. + fields[i] = stripOrdinalSuffix(fields[i]) + } + return strings.Join(fields, " ") +} + +// stripOrdinalSuffix removes ordinal suffixes (st, nd, rd, th) from a +// string that starts with digits, e.g. "5th" → "5". +func stripOrdinalSuffix(s string) string { + for _, suffix := range []string{"st", "nd", "rd", "th"} { + if strings.HasSuffix(s, suffix) { + prefix := s[:len(s)-len(suffix)] + if prefix != "" && hasDigits(prefix) { + return prefix + } + } + } + return s +} + +// deduplicateConsecutiveWords removes consecutive duplicate words, +// e.g. "eagle eagle 2 7" → "eagle 2 7". This handles STT stutter +// where words are repeated. +func isDigitLike(s string) bool { + return hasDigits(s) || digitHomophones[s] != "" +} + +func deduplicateConsecutiveWords(tx string) string { + fields := strings.Fields(tx) + if len(fields) <= 1 { + return tx + } + result := []string{fields[0]} + for i := 1; i < len(fields); i++ { + // Only deduplicate words that are not digits or digit homophones. 
+ // This collapses "eagle eagle" but preserves "won won" (→ "1 1") + // and "1 1". + if fields[i] == fields[i-1] && !isDigitLike(fields[i]) { + continue + } + result = append(result, fields[i]) + } + return strings.Join(result, " ") +} + // ParsePilotCallsign attempts to parse a callsign in one of the following formats: // - A single word, followed by a number consisting of any digits // - A number consisting of up to 3 digits @@ -22,17 +103,18 @@ func ParsePilotCallsign(tx string) (callsign string, isValid bool) { tx = removeClanTags(tx) tx = normalize(tx) tx = spaceDigits(tx) - for token, replacement := range map[string]string{ - "request": "", - "this is": "", - "want to": "12", - "tutu": "22", - "to 8": "28", - "free 1": "31", - } { - tx = strings.ReplaceAll(tx, token, replacement) + + // Discard "this is" prefix. + tx = strings.ReplaceAll(tx, "this is", "") + + // Truncate at "request" — anything after it is part of the request, not the callsign. + if idx := strings.Index(tx, "request"); idx >= 0 { + tx = tx[:idx] } + tx = deduplicateConsecutiveWords(tx) + tx = replaceDigitHomophones(tx) + var builder strings.Builder n := 0 for _, char := range tx { diff --git a/pkg/parser/callsign_test.go b/pkg/parser/callsign_test.go index e8812020..0c25cd3e 100644 --- a/pkg/parser/callsign_test.go +++ b/pkg/parser/callsign_test.go @@ -29,6 +29,71 @@ func TestParsePilotCallsign(t *testing.T) { {"Wolf 1 [CLAN]", "wolf 1"}, {"[CLAN] Wolf 1 [1SG]", "wolf 1"}, {"[Wolf 1", "wolf 1"}, + + // Homophones: "won" misheard for "one" + {"Eagle won won", "eagle 1 1"}, //nolint:dupword + {"Eagle won 1", "eagle 1 1"}, + {"Eagle 1 won", "eagle 1 1"}, + + // Homophones: "to"/"too"/"tu" misheard for "two" + {"Eagle to 1", "eagle 2 1"}, + {"Eagle too 1", "eagle 2 1"}, + {"Eagle 1 to", "eagle 1 2"}, + {"Eagle 1 too", "eagle 1 2"}, + {"Eagle to to", "eagle 2 2"}, //nolint:dupword + {"Eagle too too", "eagle 2 2"}, //nolint:dupword + + // Homophones: "for"/"fore" misheard for "four" + {"Eagle for 
1", "eagle 4 1"}, + {"Eagle 1 for", "eagle 1 4"}, + {"Eagle for for", "eagle 4 4"}, //nolint:dupword + {"Eagle fore 1", "eagle 4 1"}, + + // Homophones: "free"/"tree" misheard for "three" + {"Eagle free 1", "eagle 3 1"}, + {"Eagle 1 free", "eagle 1 3"}, + {"Eagle tree 1", "eagle 3 1"}, + + // Homophones: "ate" misheard for "eight" + {"Eagle ate 1", "eagle 8 1"}, + {"Eagle 1 ate", "eagle 1 8"}, + + // Homophones: "niner" misheard for "nine" + {"Eagle niner 1", "eagle 9 1"}, + + // Ordinals misheard for digits + {"Eagle 1st", "eagle 1"}, + {"Eagle 2nd", "eagle 2"}, + {"Eagle 3rd", "eagle 3"}, + {"Eagle 4th", "eagle 4"}, + {"Eagle 5th", "eagle 5"}, + {"Eagle 6th", "eagle 6"}, + {"Eagle 7th", "eagle 7"}, + {"Eagle 8th", "eagle 8"}, + {"Eagle 9th", "eagle 9"}, + + // Mixed homophones: all digit combinations 1-9 x 1-9 + // using commonly misheard forms + {"Eagle won to", "eagle 1 2"}, + {"Eagle won free", "eagle 1 3"}, + {"Eagle won for", "eagle 1 4"}, + {"Eagle to free", "eagle 2 3"}, + {"Eagle to for", "eagle 2 4"}, + {"Eagle free to", "eagle 3 2"}, + {"Eagle free for", "eagle 3 4"}, + {"Eagle for to", "eagle 4 2"}, + {"Eagle for free", "eagle 4 3"}, + {"Eagle for ate", "eagle 4 8"}, + {"Eagle ate to", "eagle 8 2"}, + {"Eagle ate for", "eagle 8 4"}, + {"Eagle ate ate", "eagle 8 8"}, //nolint:dupword + + // Deduplicate repeated callsign name from STT stuttering + {"Eagle Eagle 2 7", "eagle 2 7"}, //nolint:dupword + {"Eagle eagle 2 7", "eagle 2 7"}, //nolint:dupword + {"Viper Viper 3 1", "viper 3 1"}, //nolint:dupword + {"Falcon falcon 1 2", "falcon 1 2"}, //nolint:dupword + {"Hornet Hornet 4 1", "hornet 4 1"}, //nolint:dupword } for _, test := range testCases { diff --git a/pkg/parser/replacements.go b/pkg/parser/replacements.go index cdb330c8..85618983 100644 --- a/pkg/parser/replacements.go +++ b/pkg/parser/replacements.go @@ -23,6 +23,7 @@ var replacementLUT = map[string]string{ "bloggedoop": bogeyDope, "bo i doke": bogeyDope, "boado": bogeyDope, + "bodidoda": 
bogeyDope, "boat be dope": bogeyDope, "bobbiedope": bogeyDope, "bobbitoop": bogeyDope, @@ -38,6 +39,12 @@ var replacementLUT = map[string]string{ "bobydo": bogeyDope, "bochy do": bogeyDope, "bochy": bogeyDope, + "bodhi": bogeyDope, + "bohi": bogeyDope, + "bohy": bogeyDope, + "bo g": bogeyDope, + "bobie": bogeyDope, + "boti": bogeyDope, "boedo": bogeyDope, "bog it": bogeyDope, "bogado": bogeyDope, @@ -87,7 +94,9 @@ var replacementLUT = map[string]string{ "boguie": bogeyDope, "bogy": bogeyDope, "boido": bogeyDope, + "boj do": bogeyDope, "bojedo": bogeyDope, + "bojy": bogeyDope, "boji": bogeyDope, "bojudo": bogeyDope, "boke it up": bogeyDope, @@ -102,6 +111,8 @@ var replacementLUT = map[string]string{ "bologito": bogeyDope, "boly dop": bogeyDope, "bombdo": bogeyDope, + "bougie dough": bogeyDope, + "bougie": bogeyDope, "booby doo": bogeyDope, "booby dop": bogeyDope, "boobydope": bogeyDope, @@ -124,12 +135,15 @@ var replacementLUT = map[string]string{ "book it up": bogeyDope, "booy dope": bogeyDope, "bop do": bogeyDope, + "boyido": bogeyDope, "bovido": bogeyDope, "bovito": bogeyDope, "bowido": bogeyDope, "bowie dope": bogeyDope, + "boy do": bogeyDope, "boy dope": bogeyDope, "boyadop": bogeyDope, + "budgie": bogeyDope, "bubby do": bogeyDope, "bubby dope": bogeyDope, "bubbydo": bogeyDope, @@ -194,6 +208,7 @@ var replacementLUT = map[string]string{ "comsjack": radioCheck, "cons check": radioCheck, "declared": declare, + "dody dot": bogeyDope, "doggy dope": bogeyDope, "dogito": bogeyDope, "dravia check": radioCheck, @@ -218,13 +233,19 @@ var replacementLUT = map[string]string{ "microphone check": radioCheck, "mike check": radioCheck, "mo ki dope": bogeyDope, + "mogi do": bogeyDope, + "moji": bogeyDope, "mogito": bogeyDope, "obey dope": bogeyDope, "odi": bogeyDope, + "og da": bogeyDope, + "og do": bogeyDope, "og dope": bogeyDope, + "ody do": bogeyDope, "ogedo": bogeyDope, "ogeydo": bogeyDope, "oggy do": bogeyDope, + "ogie": bogeyDope, "ogi do": bogeyDope, "ogi doke": 
bogeyDope, "ogi dop": bogeyDope, @@ -290,6 +311,10 @@ var replacementLUT = map[string]string{ "trip bar": tripwire, "trip wire": tripwire, "vog it up": bogeyDope, + "vaughi": bogeyDope, + "vogee": bogeyDope, + "vogie doe": bogeyDope, + "vogie": bogeyDope, "vogadope": bogeyDope, "voged hope": bogeyDope, "vogedope": bogeyDope, @@ -299,9 +324,11 @@ var replacementLUT = map[string]string{ "vogue it up": bogeyDope, "vogue": bogeyDope, "voguy": bogeyDope, + "voji": bogeyDope, "voki": bogeyDope, "votigo": bogeyDope, "warn me": tripwire, + "wajidoke": bogeyDope, "wiggidope": bogeyDope, "wogit up": bogeyDope, "wogitop": bogeyDope, diff --git a/pkg/radar/contacts.go b/pkg/radar/contacts.go index bdd77fbd..6b99182a 100644 --- a/pkg/radar/contacts.go +++ b/pkg/radar/contacts.go @@ -13,6 +13,10 @@ import ( "github.com/rs/zerolog/log" ) +// CallsignSimilarityThreshold is the minimum Levenshtein similarity score +// required for fuzzy callsign matching (0.0-1.0). +const CallsignSimilarityThreshold = 0.63 + // contactDatabase is a thread-safe trackfile contactDatabase. type contactDatabase struct { lock sync.RWMutex @@ -45,7 +49,7 @@ func (db *contactDatabase) getByCallsignAndCoalititon(callsign string, coalition } logger.Info().Msg("callsign not found in index, attempting fuzzy search") var err error - foundCallsign, err = fuzz.FuzzySearchThreshold(callsign, keys, 0.63, fuzz.Levenshtein) + foundCallsign, err = fuzz.FuzzySearchThreshold(callsign, keys, CallsignSimilarityThreshold, fuzz.Levenshtein) if foundCallsign == "" || err != nil { logger.Warn().Err(err).Msg("callsign not found in index") return "", nil, false diff --git a/pkg/recognizer/parakeet/model/model.go b/pkg/recognizer/parakeet/model/model.go index 5e01af53..7aa2de59 100644 --- a/pkg/recognizer/parakeet/model/model.go +++ b/pkg/recognizer/parakeet/model/model.go @@ -19,27 +19,6 @@ import ( "github.com/rs/zerolog/log" ) -// DirName is the subdirectory name used for the Parakeet model within a models directory. 
-const DirName = "parakeet" - -const modelURL = "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2" - -// Filenames lists the filenames required for the Parakeet TDT model. -var Filenames = []string{ - "encoder.int8.onnx", - "decoder.int8.onnx", - "joiner.int8.onnx", - "tokens.txt", -} - -// fileHashes maps each model filename to its expected SHA256 hash. -var fileHashes = map[string]string{ //nolint:gosec // these are file integrity hashes, not credentials - "encoder.int8.onnx": "a32b12d17bbbc309d0686fbbcc2987b5e9b8333a7da83fa6b089f0a2acd651ab", - "decoder.int8.onnx": "b6bb64963457237b900e496ee9994b59294526439fbcc1fecf705b31a15c6b4e", - "joiner.int8.onnx": "7946164367946e7f9f29a122407c3252b680dbae9a51343eb2488d057c3c43d2", - "tokens.txt": "ec182b70dd42113aff6c5372c75cac58c952443eb22322f57bbd7f53977d497d", -} - // FileNotFoundError indicates that a required model file is missing. type FileNotFoundError struct { Path string @@ -99,8 +78,8 @@ func verifyFile(fpath string) error { return nil } -// Download downloads the Parakeet TDT model archive, extracts the required -// files into dir, and verifies their SHA256 hashes. +// Download downloads the Parakeet TDT model archive, verifies its SHA256 hash, +// extracts the required files into dir, and verifies their individual hashes. 
func Download(ctx context.Context, dir string) error { if err := os.MkdirAll(dir, 0o755); err != nil { return fmt.Errorf("failed to create directory: %w", err) @@ -122,12 +101,48 @@ func Download(ctx context.Context, dir string) error { return fmt.Errorf("failed to download model: HTTP %d", resp.StatusCode) } + tmpFile, err := os.CreateTemp("", "parakeet-model-*.tar.bz2") + if err != nil { + return fmt.Errorf("creating temp file: %w", err) + } + defer os.Remove(tmpFile.Name()) + defer tmpFile.Close() + + h := sha256.New() + if _, err := io.Copy(tmpFile, io.TeeReader(resp.Body, h)); err != nil { + return fmt.Errorf("downloading archive: %w", err) + } + + actual := hex.EncodeToString(h.Sum(nil)) + if actual != archiveHash { + return fmt.Errorf("archive hash mismatch: expected %s, got %s", archiveHash, actual) + } + log.Info().Msg("archive hash verified") + + if _, err := tmpFile.Seek(0, io.SeekStart); err != nil { + return fmt.Errorf("seeking temp file: %w", err) + } + + if err := extractArchive(tmpFile, dir); err != nil { + return err + } + + log.Info().Msg("verifying model file hashes") + if err := Verify(dir); err != nil { + return fmt.Errorf("model verification after download failed: %w", err) + } + + log.Info().Msg("model download complete") + return nil +} + +func extractArchive(r io.Reader, dir string) error { needed := make(map[string]bool, len(Filenames)) for _, f := range Filenames { needed[f] = true } - bzReader := bzip2.NewReader(resp.Body) + bzReader := bzip2.NewReader(r) tarReader := tar.NewReader(bzReader) extracted := 0 @@ -140,13 +155,11 @@ func Download(ctx context.Context, dir string) error { return fmt.Errorf("reading tar archive: %w", err) } - // The archive contains files under a top-level directory; extract only the base name. base := filepath.Base(header.Name) if !needed[base] { continue } - // Guard against path traversal. 
if strings.Contains(base, "..") { continue } @@ -162,13 +175,6 @@ func Download(ctx context.Context, dir string) error { if extracted != len(Filenames) { return fmt.Errorf("expected %d model files in archive, found %d", len(Filenames), extracted) } - - log.Info().Msg("verifying model file hashes") - if err := Verify(dir); err != nil { - return fmt.Errorf("model verification after download failed: %w", err) - } - - log.Info().Msg("model download complete") return nil } @@ -178,7 +184,7 @@ func extractTarEntry(dst string, r io.Reader) error { return fmt.Errorf("creating file %s: %w", dst, err) } defer f.Close() - if _, err := io.Copy(f, r); err != nil { //nolint:gosec // archive source is trusted + if _, err := io.Copy(f, r); err != nil { //nolint:gosec // archive hash verified before extraction return fmt.Errorf("writing file %s: %w", dst, err) } return nil diff --git a/pkg/recognizer/parakeet/model/version.go b/pkg/recognizer/parakeet/model/version.go new file mode 100644 index 00000000..25b17bb9 --- /dev/null +++ b/pkg/recognizer/parakeet/model/version.go @@ -0,0 +1,25 @@ +package model + +// DirName is the subdirectory name used for the Parakeet model within a models directory. +const DirName = "parakeet" + +const modelURL = "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2" + +// archiveHash is the expected SHA256 hash of the downloaded tar.bz2 archive. +const archiveHash = "157c157bc51155e03e37d2466522a3a737dd9c72bb25f36eb18912964161e1ad" + +// Filenames lists the filenames required for the Parakeet TDT model. +var Filenames = []string{ + "encoder.int8.onnx", + "decoder.int8.onnx", + "joiner.int8.onnx", + "tokens.txt", +} + +// fileHashes maps each model filename to its expected SHA256 hash. 
+var fileHashes = map[string]string{ //nolint:gosec // these are file integrity hashes, not credentials + "encoder.int8.onnx": "a32b12d17bbbc309d0686fbbcc2987b5e9b8333a7da83fa6b089f0a2acd651ab", + "decoder.int8.onnx": "b6bb64963457237b900e496ee9994b59294526439fbcc1fecf705b31a15c6b4e", + "joiner.int8.onnx": "7946164367946e7f9f29a122407c3252b680dbae9a51343eb2488d057c3c43d2", + "tokens.txt": "ec182b70dd42113aff6c5372c75cac58c952443eb22322f57bbd7f53977d497d", +} diff --git a/pkg/synthesizer/pocket/model/model.go b/pkg/synthesizer/pocket/model/model.go new file mode 100644 index 00000000..c04dfd02 --- /dev/null +++ b/pkg/synthesizer/pocket/model/model.go @@ -0,0 +1,191 @@ +// Package model provides download and verification of Pocket TTS model files. +// This package has no CGO dependencies and can be built with CGO_ENABLED=0. +package model + +import ( + "archive/tar" + "compress/bzip2" + "context" + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + "io" + "net/http" + "os" + "path/filepath" + "strings" + + "github.com/rs/zerolog/log" +) + +// FileNotFoundError indicates that a required model file is missing. +type FileNotFoundError struct { + Path string + Err error +} + +func (e *FileNotFoundError) Error() string { + return "model file not found: " + e.Path +} + +func (e *FileNotFoundError) Unwrap() error { + return e.Err +} + +// CorruptFileError indicates that a model file exists but has an incorrect hash. +type CorruptFileError struct { + Path string + Expected string + Actual string +} + +func (e *CorruptFileError) Error() string { + return fmt.Sprintf("model file %s: hash mismatch (expected %s, got %s)", e.Path, e.Expected, e.Actual) +} + +// Verify checks that all model files exist in dir and match their expected SHA256 hashes. +// All files are checked and all errors are collected into a single joined error. 
+func Verify(dir string) error { + var errs []error + for _, name := range Filenames { + if err := verifyFile(filepath.Join(dir, name)); err != nil { + errs = append(errs, err) + } + } + return errors.Join(errs...) +} + +func verifyFile(fpath string) error { + f, err := os.Open(fpath) + if err != nil { + if os.IsNotExist(err) { + return &FileNotFoundError{Path: fpath, Err: err} + } + return fmt.Errorf("opening model file %s: %w", fpath, err) + } + defer f.Close() + h := sha256.New() + if _, err := io.Copy(h, f); err != nil { + return fmt.Errorf("reading model file %s: %w", fpath, err) + } + actual := hex.EncodeToString(h.Sum(nil)) + basename := filepath.Base(fpath) + expected := fileHashes[basename] + if actual != expected { + return &CorruptFileError{Path: fpath, Expected: expected, Actual: actual} + } + return nil +} + +// Download downloads the Pocket TTS model archive, verifies its SHA256 hash, +// extracts the required files into dir, and verifies their individual hashes. +func Download(ctx context.Context, dir string) error { + if err := os.MkdirAll(dir, 0o755); err != nil { + return fmt.Errorf("failed to create directory: %w", err) + } + + log.Info().Str("url", modelURL).Msg("downloading Pocket TTS model") + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, modelURL, nil) + if err != nil { + return fmt.Errorf("creating download request: %w", err) + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return fmt.Errorf("failed to download model: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("failed to download model: HTTP %d", resp.StatusCode) + } + + tmpFile, err := os.CreateTemp("", "pocket-model-*.tar.bz2") + if err != nil { + return fmt.Errorf("creating temp file: %w", err) + } + defer os.Remove(tmpFile.Name()) + defer tmpFile.Close() + + h := sha256.New() + if _, err := io.Copy(tmpFile, io.TeeReader(resp.Body, h)); err != nil { + return fmt.Errorf("downloading archive: %w", 
err) + } + + actual := hex.EncodeToString(h.Sum(nil)) + if actual != archiveHash { + return fmt.Errorf("archive hash mismatch: expected %s, got %s", archiveHash, actual) + } + log.Info().Msg("archive hash verified") + + if _, err := tmpFile.Seek(0, io.SeekStart); err != nil { + return fmt.Errorf("seeking temp file: %w", err) + } + + if err := extractArchive(tmpFile, dir); err != nil { + return err + } + + log.Info().Msg("verifying model file hashes") + if err := Verify(dir); err != nil { + return fmt.Errorf("model verification after download failed: %w", err) + } + + log.Info().Msg("model download complete") + return nil +} + +func extractArchive(r io.Reader, dir string) error { + needed := make(map[string]bool, len(Filenames)) + for _, f := range Filenames { + needed[f] = true + } + + bzReader := bzip2.NewReader(r) + tarReader := tar.NewReader(bzReader) + + extracted := 0 + for { + header, err := tarReader.Next() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return fmt.Errorf("reading tar archive: %w", err) + } + + base := filepath.Base(header.Name) + if !needed[base] { + continue + } + + if strings.Contains(base, "..") { + continue + } + + dst := filepath.Join(dir, base) + if err := extractTarEntry(dst, tarReader); err != nil { + return err + } + log.Info().Str("file", base).Msg("extracted model file") + extracted++ + } + + if extracted != len(Filenames) { + return fmt.Errorf("expected %d model files in archive, found %d", len(Filenames), extracted) + } + return nil +} + +func extractTarEntry(dst string, r io.Reader) error { + f, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644) + if err != nil { + return fmt.Errorf("creating file %s: %w", dst, err) + } + defer f.Close() + if _, err := io.Copy(f, r); err != nil { //nolint:gosec // archive hash verified before extraction + return fmt.Errorf("writing file %s: %w", dst, err) + } + return nil +} diff --git a/pkg/synthesizer/pocket/model/version.go 
b/pkg/synthesizer/pocket/model/version.go new file mode 100644 index 00000000..bc0312a1 --- /dev/null +++ b/pkg/synthesizer/pocket/model/version.go @@ -0,0 +1,42 @@ +package model + +// DirName is the subdirectory name used for the Pocket TTS model within a models directory. +const DirName = "pocket" + +const modelURL = "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-pocket-tts-int8-2026-01-26.tar.bz2" + +// archiveHash is the expected SHA256 hash of the downloaded tar.bz2 archive. +const archiveHash = "2f3b88823cbbb9bf0b2477ec8ae7b3fec417b3a87b6bb5f256dba66f2ad967cb" + +// Model file names for Pocket TTS. +const ( + FilenameLmMain = "lm_main.int8.onnx" + FilenameLmFlow = "lm_flow.int8.onnx" + FilenameDecoder = "decoder.int8.onnx" + FilenameEncoder = "encoder.onnx" + FilenameTextConditioner = "text_conditioner.onnx" + FilenameVocabJSON = "vocab.json" + FilenameTokenScoresJSON = "token_scores.json" +) + +// Filenames lists the filenames required for the Pocket TTS model. +var Filenames = []string{ + FilenameLmMain, + FilenameLmFlow, + FilenameDecoder, + FilenameEncoder, + FilenameTextConditioner, + FilenameVocabJSON, + FilenameTokenScoresJSON, +} + +// fileHashes maps each model filename to its expected SHA256 hash. 
+var fileHashes = map[string]string{ //nolint:gosec // SHA256 hashes for model verification, not credentials + FilenameLmMain: "bfc0c7e7e3d72864fa3bb2ee499f62f21ddc1474b885f5f3ca570f8be73e787e", + FilenameLmFlow: "8d627d235c44a597da908e1085ebe241cbbe358964c502c5a5063d18851a5529", + FilenameDecoder: "12b0857402d31aead94df19d6783b4350d1f740e811f3a3202c70ad89ae11eea", + FilenameEncoder: "e8f2f6d301ffb96e398b138a7dc6d3038622d236044636b73d920bab85890260", + FilenameTextConditioner: "0b84e837d7bfaf2c896627b03e3f080320309f37f4fc7df7698c644f7ba5e6b1", + FilenameVocabJSON: "6fb646346cf931016f70c4921aab0900ce7a304b893cb02135c74e294abfea01", + FilenameTokenScoresJSON: "5be2f278caf9b9800741f0fd82bff677f4943ec764c356f907213434b622d958", +} diff --git a/pkg/synthesizer/pocket/pocket.go b/pkg/synthesizer/pocket/pocket.go new file mode 100644 index 00000000..838451ae --- /dev/null +++ b/pkg/synthesizer/pocket/pocket.go @@ -0,0 +1,160 @@ +// Package pocket provides a text-to-speech speaker using Pocket TTS via sherpa-onnx. +package pocket + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + + "github.com/dharmab/skyeye/pkg/synthesizer/pocket/model" + "github.com/dharmab/skyeye/pkg/synthesizer/pocket/voice" + "github.com/dharmab/skyeye/pkg/synthesizer/speakers" + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" + "github.com/martinlindhe/unit" + "github.com/rs/zerolog/log" +) + +type options struct { + voiceFile string + numSteps int +} + +// Option configures Speaker behavior. +type Option func(*options) + +// WithVoiceFile sets a custom WAV file for voice cloning reference audio. +// The file must be 16-bit PCM mono WAV. If the file cannot be read, +// the embedded default voice is used instead. +func WithVoiceFile(path string) Option { + return func(o *options) { + o.voiceFile = path + } +} + +// WithNumSteps sets the number of inference steps (default 10). 
+func WithNumSteps(n int) Option { + return func(o *options) { + o.numSteps = n + } +} + +// Speaker implements speakers.Speaker using Pocket TTS. +type Speaker struct { + tts *sherpa.OfflineTts + genConfig sherpa.GenerationConfig +} + +var _ speakers.Speaker = (*Speaker)(nil) + +// New creates a Speaker. modelDir must contain the Pocket TTS model files. +func New(modelDir string, opts ...Option) (*Speaker, error) { + o := &options{ + numSteps: 10, + } + for _, opt := range opts { + opt(o) + } + + config := sherpa.OfflineTtsConfig{ + Model: sherpa.OfflineTtsModelConfig{ + Pocket: sherpa.OfflineTtsPocketModelConfig{ + LmMain: filepath.Join(modelDir, model.FilenameLmMain), + LmFlow: filepath.Join(modelDir, model.FilenameLmFlow), + Decoder: filepath.Join(modelDir, model.FilenameDecoder), + Encoder: filepath.Join(modelDir, model.FilenameEncoder), + TextConditioner: filepath.Join(modelDir, model.FilenameTextConditioner), + VocabJson: filepath.Join(modelDir, model.FilenameVocabJSON), + TokenScoresJson: filepath.Join(modelDir, model.FilenameTokenScoresJSON), + }, + NumThreads: 1, + Debug: 0, + }, + } + + tts := sherpa.NewOfflineTts(&config) + if tts == nil { + return nil, errors.New("failed to create Pocket TTS from model files") + } + + refAudio, refRate, err := loadReferenceAudio(o.voiceFile) + if err != nil { + return nil, fmt.Errorf("loading reference audio: %w", err) + } + + genConfig := sherpa.GenerationConfig{ + ReferenceAudio: refAudio, + ReferenceSampleRate: refRate, + NumSteps: o.numSteps, + Speed: 1.0, + } + + return &Speaker{ + tts: tts, + genConfig: genConfig, + }, nil +} + +// Say implements speakers.Speaker. 
+func (s *Speaker) Say(ctx context.Context, text string) ([]float32, error) { + cb := func(_ []float32, _ float32) bool { + select { + case <-ctx.Done(): + return false + default: + return true + } + } + + audio := s.tts.GenerateWithConfig(text, &s.genConfig, cb) + if audio == nil { + if ctx.Err() != nil { + return nil, ctx.Err() + } + return nil, errors.New("pocket TTS generation returned nil") + } + + sourceRate := unit.Frequency(audio.SampleRate) * unit.Hertz + resampled, err := speakers.DownsampleF32(audio.Samples, sourceRate) + if err != nil { + return nil, fmt.Errorf("resampling pocket TTS output: %w", err) + } + + return resampled, nil +} + +// Close releases C resources held by the TTS engine. +func (s *Speaker) Close() { + sherpa.DeleteOfflineTts(s.tts) +} + +func loadReferenceAudio(voiceFile string) ([]float32, int, error) { + if voiceFile != "" { + samples, sampleRate, err := loadVoiceFile(voiceFile) + if err == nil { + return samples, sampleRate, nil + } + log.Warn().Err(err).Str("path", voiceFile).Msg("failed to load voice file, falling back to default") + } + + samples, sampleRate, err := voice.DecodeWAV(voice.DefaultVoice) + if err != nil { + return nil, 0, fmt.Errorf("decoding embedded default voice: %w", err) + } + log.Info().Msg("using default reference voice") + return samples, sampleRate, nil +} + +func loadVoiceFile(path string) ([]float32, int, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, 0, fmt.Errorf("reading voice file: %w", err) + } + samples, sampleRate, err := voice.DecodeWAV(data) + if err != nil { + return nil, 0, fmt.Errorf("decoding voice file: %w", err) + } + log.Info().Str("path", path).Msg("using custom reference voice") + return samples, sampleRate, nil +} diff --git a/pkg/synthesizer/pocket/pocket_test.go b/pkg/synthesizer/pocket/pocket_test.go new file mode 100644 index 00000000..41980843 --- /dev/null +++ b/pkg/synthesizer/pocket/pocket_test.go @@ -0,0 +1,429 @@ +//go:build integration + +package 
pocket_test + +import ( + "context" + "fmt" + "math/rand/v2" + "os" + "path/filepath" + "runtime" + "strings" + "sync" + "testing" + + "github.com/dharmab/skyeye/pkg/bearings" + "github.com/dharmab/skyeye/pkg/brevity" + "github.com/dharmab/skyeye/pkg/parser" + "github.com/dharmab/skyeye/pkg/radar" + "github.com/dharmab/skyeye/pkg/recognizer/parakeet" + parakeetmodel "github.com/dharmab/skyeye/pkg/recognizer/parakeet/model" + "github.com/dharmab/skyeye/pkg/synthesizer/pocket" + pocketmodel "github.com/dharmab/skyeye/pkg/synthesizer/pocket/model" + fuzz "github.com/hbollon/go-edlib" + "github.com/martinlindhe/unit" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const gciCallsign = "Magic" + +// callsignCandidates returns all "word X Y" callsigns from 1 1 to 9 9 +// for the given callsign word, simulating what would exist in the radar database. +func callsignCandidates(word string) []string { + candidates := make([]string, 0, 81) + for f := 1; f <= 9; f++ { + for s := 1; s <= 9; s++ { + candidates = append(candidates, fmt.Sprintf("%s %d %d", word, f, s)) + } + } + return candidates +} + +// newPipeline sets up the TTS speaker, STT recognizer, and parser for integration tests. +// It skips the test if model files are not available. 
+func newPipeline(t *testing.T) (*pocket.Speaker, *parser.Parser, func(string) string) { + t.Helper() + + modelsPath := os.Getenv("SKYEYE_MODELS_PATH") + if modelsPath == "" { + modelsPath = "models" + } + + pocketDir := filepath.Join(modelsPath, pocketmodel.DirName) + require.NoError(t, pocketmodel.Verify(pocketDir), "Pocket TTS model files must be present") + + parakeetDir := filepath.Join(modelsPath, parakeetmodel.DirName) + require.NoError(t, parakeetmodel.Verify(parakeetDir), "Parakeet model files must be present") + + speaker, err := pocket.New(pocketDir) + require.NoError(t, err) + + rec, err := parakeet.NewRecognizer(parakeetDir) + require.NoError(t, err) + + p := parser.New(gciCallsign, true) + + // synthesizeAndRecognize runs TTS→STT and returns the recognized text. + synthesizeAndRecognize := func(text string) string { + t.Helper() + audio, err := speaker.Say(context.Background(), text) + require.NoError(t, err) + require.NotEmpty(t, audio) + + recognized, err := rec.Recognize(context.Background(), audio, false) + require.NoError(t, err) + t.Logf("Input: %q", text) + t.Logf("Recognized: %q", recognized) + return recognized + } + + return speaker, p, synthesizeAndRecognize +} + +func TestRoundTripRadioCheck(t *testing.T) { + t.Parallel() + speaker, p, recognize := newPipeline(t) + defer speaker.Close() + + recognized := recognize("Magic, Falcon 2 1, radio check") + request := p.Parse(recognized) + require.IsType(t, &brevity.RadioCheckRequest{}, request) + actual := request.(*brevity.RadioCheckRequest) + assert.Equal(t, "falcon 2 1", actual.Callsign) +} + +func TestRoundTripAlphaCheck(t *testing.T) { + t.Parallel() + speaker, p, recognize := newPipeline(t) + defer speaker.Close() + + recognized := recognize("Magic, Viper 3 1, alpha check") + request := p.Parse(recognized) + require.IsType(t, &brevity.AlphaCheckRequest{}, request) + actual := request.(*brevity.AlphaCheckRequest) + assert.Equal(t, "viper 3 1", actual.Callsign) +} + +func 
TestRoundTripBogeyDope(t *testing.T) { + t.Parallel() + speaker, p, recognize := newPipeline(t) + defer speaker.Close() + + recognized := recognize("Magic, Hornet 4 1, bogey dope") + request := p.Parse(recognized) + require.IsType(t, &brevity.BogeyDopeRequest{}, request) + actual := request.(*brevity.BogeyDopeRequest) + assert.Equal(t, "hornet 4 1", actual.Callsign) + assert.Equal(t, brevity.Aircraft, actual.Filter) +} + +func TestRoundTripPicture(t *testing.T) { + t.Parallel() + speaker, p, recognize := newPipeline(t) + defer speaker.Close() + + recognized := recognize("Magic, Eagle 2 1, picture") + request := p.Parse(recognized) + require.IsType(t, &brevity.PictureRequest{}, request) + actual := request.(*brevity.PictureRequest) + // Snap callsign using edit distance against a multi-flight candidate list, + // mirroring the real radar database. + var candidates []string + for _, w := range []string{"eagle", "mobius", "wardog"} { + candidates = append(candidates, callsignCandidates(w)...) + } + snapped, err := fuzz.FuzzySearchThreshold(actual.Callsign, candidates, radar.CallsignSimilarityThreshold, fuzz.Levenshtein) + require.NoError(t, err) + assert.Equal(t, "eagle 2 1", snapped, "callsign=%q did not snap to eagle 2 1", actual.Callsign) +} + +func TestRoundTripSpiked(t *testing.T) { + t.Parallel() + speaker, p, recognize := newPipeline(t) + defer speaker.Close() + + recognized := recognize("Magic, Cobra 3 1, spiked, one eight zero") + request := p.Parse(recognized) + require.IsType(t, &brevity.SpikedRequest{}, request) + actual := request.(*brevity.SpikedRequest) + assert.Equal(t, "cobra 3 1", actual.Callsign) + assert.Equal(t, bearings.NewMagneticBearing(180*unit.Degree), actual.Bearing) +} + +// TestRoundTripCallsignNumbers tests TTS→STT→parser round trips across many +// callsign words, number combinations, and request phrasings. 
Since TTS→STT +// is inherently lossy, individual permutations may fail — the test uses a +// probabilistic approach and requires an overall success rate above 99%. +func TestRoundTripCallsignNumbers(t *testing.T) { + t.Parallel() + + callsignWords := []string{"Eagle", "Mobius", "Wardog"} + requestPhrases := []string{"bogey dope", "request bogey dope"} + + // Build a combined candidate list with all callsign words, simulating + // a mission with multiple flights in the radar database. + var allCandidates []string + for _, word := range callsignWords { + allCandidates = append(allCandidates, callsignCandidates(strings.ToLower(word))...) + } + + type callsign struct { + word string + first, second int + } + + // Build pools of callsigns: common ones (flights 1-4) and all possible. + var commonCallsigns, allCallsigns []callsign + for _, word := range callsignWords { + for first := 1; first <= 9; first++ { + for second := 1; second <= 9; second++ { + cs := callsign{word, first, second} + allCallsigns = append(allCallsigns, cs) + if first >= 1 && first <= 4 && second >= 1 && second <= 4 { + commonCallsigns = append(commonCallsigns, cs) + } + } + } + } + + // Select 40 callsigns: 20 from common (1 1 through 1 4), 20 from entire set. + const numCommon, numRandom = 20, 20 + rand.Shuffle(len(commonCallsigns), func(i, j int) { + commonCallsigns[i], commonCallsigns[j] = commonCallsigns[j], commonCallsigns[i] + }) + rand.Shuffle(len(allCallsigns), func(i, j int) { + allCallsigns[i], allCallsigns[j] = allCallsigns[j], allCallsigns[i] + }) + + selected := make(map[callsign]struct{}) + for _, cs := range commonCallsigns { + if len(selected) >= numCommon { + break + } + selected[cs] = struct{}{} + } + for _, cs := range allCallsigns { + if len(selected) >= numCommon+numRandom { + break + } + selected[cs] = struct{}{} + } + + type testInput struct { + input string + expectedCallsign string + } + + // Build unique test inputs from selected callsigns × request phrases. 
+ var unique []testInput + for cs := range selected { + expectedCallsign := fmt.Sprintf("%s %d %d", strings.ToLower(cs.word), cs.first, cs.second) + for _, phrase := range requestPhrases { + unique = append(unique, testInput{ + input: fmt.Sprintf("Magic, %s %d %d, %s", cs.word, cs.first, cs.second, phrase), + expectedCallsign: expectedCallsign, + }) + } + } + + // Repeat test inputs to reach a minimum count for statistical sensitivity. + // TTS is nondeterministic, so repeating the same input tests different + // audio renderings of the same phrase. + const minTests = 500 + var inputs []testInput + for len(inputs) < minTests { + inputs = append(inputs, unique...) + } + + type result struct { + input string + recognized string + success bool + detail string + } + + // Create a worker pool of TTS→STT pipelines. + numWorkers := max(runtime.NumCPU()/2, 1) + t.Logf("Using %d workers for %d test inputs", numWorkers, len(inputs)) + + results := make([]result, len(inputs)) + work := make(chan int, len(inputs)) + for i := range inputs { + work <- i + } + close(work) + + var wg sync.WaitGroup + for w := range numWorkers { + wg.Add(1) + go func() { + defer wg.Done() + + // Each worker gets its own pipeline (TTS + STT are not thread-safe). 
+ modelsPath := os.Getenv("SKYEYE_MODELS_PATH") + if modelsPath == "" { + modelsPath = "models" + } + + pocketDir := filepath.Join(modelsPath, pocketmodel.DirName) + if err := pocketmodel.Verify(pocketDir); err != nil { + t.Errorf("worker %d: pocket model not available: %v", w, err) + return + } + + speaker, err := pocket.New(pocketDir) + if err != nil { + t.Errorf("worker %d: failed to create speaker: %v", w, err) + return + } + defer speaker.Close() + + parakeetDir := filepath.Join(modelsPath, parakeetmodel.DirName) + rec, err := parakeet.NewRecognizer(parakeetDir) + if err != nil { + t.Errorf("worker %d: failed to create recognizer: %v", w, err) + return + } + + p := parser.New(gciCallsign, true) + + for idx := range work { + ti := inputs[idx] + audio, err := speaker.Say(context.Background(), ti.input) + if err != nil { + results[idx] = result{input: ti.input, detail: fmt.Sprintf("TTS failed: %v", err)} + continue + } + + recognized, err := rec.Recognize(context.Background(), audio, false) + if err != nil { + results[idx] = result{input: ti.input, detail: fmt.Sprintf("STT failed: %v", err)} + continue + } + t.Logf("Input: %q", ti.input) + t.Logf("Recognized: %q", recognized) + + request := p.Parse(recognized) + r := result{input: ti.input, recognized: recognized} + + if request == nil { + r.detail = "parse returned nil" + results[idx] = r + continue + } + + bogeyDope, ok := request.(*brevity.BogeyDopeRequest) + if !ok { + r.detail = fmt.Sprintf("wrong type: %T", request) + results[idx] = r + continue + } + + snapped, err := fuzz.FuzzySearchThreshold( + bogeyDope.Callsign, allCandidates, + radar.CallsignSimilarityThreshold, fuzz.Levenshtein, + ) + if err != nil || snapped == "" { + r.detail = fmt.Sprintf("callsign %q did not snap to any candidate", bogeyDope.Callsign) + results[idx] = r + continue + } + + if snapped != ti.expectedCallsign { + r.detail = fmt.Sprintf("callsign %q snapped to %q, expected %q", bogeyDope.Callsign, snapped, ti.expectedCallsign) + 
results[idx] = r + continue + } + + r.success = true + results[idx] = r + } + }() + } + wg.Wait() + + total := len(results) + failures := 0 + for _, r := range results { + if !r.success { + failures++ + t.Logf("FAIL: input=%q recognized=%q reason=%s", r.input, r.recognized, r.detail) + } + } + successRate := float64(total-failures) / float64(total) + t.Logf("Results: %d/%d passed (%.1f%% success rate)", total-failures, total, successRate*100) + if successRate < 0.99 { + t.Errorf("Success rate %.1f%% is below 99%% threshold (%d failures out of %d tests)", successRate*100, failures, total) + } +} + +// FuzzRoundTrip verifies the TTS→STT→parser pipeline does not panic on arbitrary input. +// It synthesizes fuzz-generated text, recognizes it, and parses the result. +// The test passes as long as no step panics — the parser is expected to return nil for nonsense input. +func FuzzRoundTrip(f *testing.F) { + modelsPath := os.Getenv("SKYEYE_MODELS_PATH") + if modelsPath == "" { + modelsPath = "models" + } + + pocketDir := filepath.Join(modelsPath, pocketmodel.DirName) + if err := pocketmodel.Verify(pocketDir); err != nil { + f.Skipf("Pocket TTS model not available: %v", err) + } + + parakeetDir := filepath.Join(modelsPath, parakeetmodel.DirName) + if err := parakeetmodel.Verify(parakeetDir); err != nil { + f.Skipf("Parakeet model not available: %v", err) + } + + speaker, err := pocket.New(pocketDir) + require.NoError(f, err) + defer speaker.Close() + + rec, err := parakeet.NewRecognizer(parakeetDir) + require.NoError(f, err) + + p := parser.New(gciCallsign, false) + + // Seed corpus with realistic GCI requests. 
+ seeds := []string{ + "Magic, Falcon 2 1, radio check", + "Magic, Viper 3 1, alpha check", + "Magic, Hornet 4 1, bogey dope", + "Magic, Eagle 2 1, picture", + "Magic, Cobra 3 1, spiked, one eight zero", + "Magic, Raptor 1 2, bogey dope fighters", + "Magic, Thunder 5 1, declare, bullseye zero nine zero, forty, twenty thousand", + "Hello world, this is a test of the text to speech system", + "Anyface, Mobius 1, radio check", + } + for _, s := range seeds { + f.Add(s) + } + + ctx := context.Background() + f.Fuzz(func(t *testing.T, input string) { + if len(input) == 0 || len(input) > 200 { + t.Skip() + } + + audio, err := speaker.Say(ctx, input) + if err != nil { + // TTS may legitimately fail on bizarre input; that's fine. + t.Skipf("TTS failed: %v", err) + } + if len(audio) == 0 { + t.Skip("TTS produced empty audio") + } + + recognized, err := rec.Recognize(ctx, audio, false) + if err != nil { + t.Skipf("STT failed: %v", err) + } + + // Parser should never panic regardless of input. + _ = p.Parse(recognized) + }) +} diff --git a/pkg/synthesizer/pocket/voice/default.wav b/pkg/synthesizer/pocket/voice/default.wav new file mode 100644 index 00000000..038c5f2e Binary files /dev/null and b/pkg/synthesizer/pocket/voice/default.wav differ diff --git a/pkg/synthesizer/pocket/voice/voice.go b/pkg/synthesizer/pocket/voice/voice.go new file mode 100644 index 00000000..903fdec7 --- /dev/null +++ b/pkg/synthesizer/pocket/voice/voice.go @@ -0,0 +1,94 @@ +// Package voice provides the default reference audio for Pocket TTS voice cloning. +// This package has no CGO dependencies and can be built with CGO_ENABLED=0. +package voice + +import ( + "encoding/binary" + "errors" + "fmt" + "math" + + _ "embed" +) + +// DefaultVoice is the embedded default reference WAV file for voice cloning. +// +//go:embed default.wav +var DefaultVoice []byte + +// DecodeWAV decodes a 16-bit PCM mono WAV file into float32 samples and its sample rate. 
// The input must be a valid WAV file with 16-bit signed PCM encoding and exactly 1 channel.
//
// The RIFF chunk list is walked in order. Chunks other than "fmt " and "data"
// are skipped, and the "fmt " chunk must come before the "data" chunk. A chunk
// whose declared size runs past the end of the input is treated as ending where
// the input ends, so a truncated "data" chunk is decoded up to the last
// complete sample that is present.
//
// Samples are scaled by 1/math.MaxInt16 and clamped at -1, so every returned
// value lies in [-1, 1]. On error the returned samples are nil and the sample
// rate is 0.
func DecodeWAV(data []byte) (samples []float32, sampleRate int, err error) {
	if len(data) < 44 {
		return nil, 0, errors.New("WAV data too short for header")
	}

	// Verify RIFF header
	if string(data[0:4]) != "RIFF" {
		return nil, 0, errors.New("missing RIFF header")
	}
	if string(data[8:12]) != "WAVE" {
		return nil, 0, errors.New("missing WAVE format identifier")
	}

	var (
		fmtFound      bool
		audioFormat   uint16
		numChannels   uint16
		bitsPerSample uint16
	)

	// Chunks start right after the 12-byte RIFF/WAVE header; each has an
	// 8-byte header (4-byte ID + little-endian uint32 size).
	offset := 12
	for offset+8 <= len(data) {
		chunkID := string(data[offset : offset+4])
		payload := data[offset+8:]

		// Clamp the declared size to the bytes actually present. Comparing in
		// uint64 keeps a huge declared size from turning negative when int is
		// 32 bits wide, which would otherwise move offset backwards.
		chunkSize := len(payload)
		if declared := binary.LittleEndian.Uint32(data[offset+4 : offset+8]); uint64(declared) < uint64(chunkSize) {
			chunkSize = int(declared)
		}
		payload = payload[:chunkSize]

		switch chunkID {
		case "fmt ":
			// 16 bytes is the size of the basic PCM format description.
			if chunkSize < 16 {
				return nil, 0, errors.New("fmt chunk too small")
			}
			audioFormat = binary.LittleEndian.Uint16(payload[0:2])
			numChannels = binary.LittleEndian.Uint16(payload[2:4])
			sampleRate = int(binary.LittleEndian.Uint32(payload[4:8]))
			bitsPerSample = binary.LittleEndian.Uint16(payload[14:16])
			fmtFound = true

		case "data":
			if !fmtFound {
				return nil, 0, errors.New("data chunk before fmt chunk")
			}
			if audioFormat != 1 {
				return nil, 0, fmt.Errorf("unsupported audio format %d (expected 1 = PCM)", audioFormat)
			}
			if numChannels != 1 {
				return nil, 0, fmt.Errorf("unsupported channel count %d (expected 1 = mono)", numChannels)
			}
			if bitsPerSample != 16 {
				return nil, 0, fmt.Errorf("unsupported bits per sample %d (expected 16)", bitsPerSample)
			}

			// A trailing odd byte cannot form a sample and is ignored.
			samples = make([]float32, len(payload)/2)
			for i := range samples {
				s := int16(binary.LittleEndian.Uint16(payload[i*2 : i*2+2])) //nolint:gosec // reinterpreting 16-bit PCM samples
				v := float32(s) / math.MaxInt16
				// math.MinInt16 has a larger magnitude than math.MaxInt16, so
				// the most negative sample would land just below -1.
				if v < -1 {
					v = -1
				}
				samples[i] = v
			}
			return samples, sampleRate, nil
		}

		// Advance to next chunk (chunks are word-aligned)
		offset += 8 + chunkSize
		if chunkSize%2 != 0 {
			offset++
		}
	}

	if !fmtFound {
		return nil, 0, errors.New("fmt chunk not found")
	}
	return nil, 0, errors.New("data chunk not found")
}
-func NewMacOSSpeaker(useSystemVoice bool, playbackSpeed float64) Speaker { - synth := &macOSSynth{} - if playbackSpeed != conf.DefaultPlaybackSpeed { - const ( - maxRate = 300 * unit.Hertz - defaultRate = 180 * unit.Hertz - minRate = 120 * unit.Hertz - ) - var rate unit.Frequency - if playbackSpeed < 0 { - rate = maxRate - } else if playbackSpeed > conf.DefaultPlaybackSpeed { - rate = minRate - } else { - var shift unit.Frequency - if playbackSpeed < conf.DefaultPlaybackSpeed { - shift = unit.Frequency(playbackSpeed*(maxRate-defaultRate).Hertz()) * unit.Hertz - } else { - shift = unit.Frequency(1-playbackSpeed*(maxRate-defaultRate).Hertz()) * unit.Hertz - } - rate = defaultRate + shift - } - if !useSystemVoice { - synth.voice = "Samantha" - } - synth.rate = &rate - } - return synth -} - -// Say implements [Speaker.Say]. -func (s *macOSSynth) Say(ctx context.Context, text string) ([]float32, error) { - outFile, err := os.CreateTemp("", "skyeye-*.aiff") - if err != nil { - return nil, fmt.Errorf("failed to create temporary AIFF file: %w", err) - } - defer os.Remove(outFile.Name()) - - args := []string{"--output", outFile.Name()} - if s.voice != "" { - args = append(args, "--voice", s.voice) - } - if s.rate != nil { - args = append(args, "--rate", fmt.Sprintf("%.1f", s.rate.Hertz())) - } - args = append(args, text) - command := exec.CommandContext(ctx, "say", args...) 
- if err = command.Run(); err != nil { - return nil, fmt.Errorf("failed to execute 'say' command: %w", err) - } - - decoder := aiff.NewDecoder(outFile) - buf, err := decoder.FullPCMBuffer() - if err != nil { - return nil, fmt.Errorf("failed to decode AIFF file: %w", err) - } - f32 := buf.AsFloat32Buffer() - b := pcm.F32toS16LEBytes(f32.Data) - sample, err := downsample(b, unit.Frequency(decoder.SampleRate)*unit.Hertz) - if err != nil { - return nil, fmt.Errorf("failed to downsample audio: %w", err) - } - - f32le := pcm.S16LEBytesToF32LE(sample) - return f32le, nil -} diff --git a/pkg/synthesizer/speakers/piper.go b/pkg/synthesizer/speakers/piper.go deleted file mode 100644 index 29584114..00000000 --- a/pkg/synthesizer/speakers/piper.go +++ /dev/null @@ -1,52 +0,0 @@ -package speakers - -import ( - "context" - "fmt" - "time" - - asset "github.com/amitybell/piper-asset" - masculine "github.com/amitybell/piper-voice-alan" - feminine "github.com/amitybell/piper-voice-jenny" - "github.com/dharmab/skyeye/pkg/pcm" - "github.com/dharmab/skyeye/pkg/synthesizer/voices" - "github.com/martinlindhe/unit" - "github.com/nabbl/piper" -) - -type piperSynth struct { - tts *piper.TTS - speed float64 - pauseLength time.Duration -} - -var _ Speaker = (*piperSynth)(nil) - -// NewPiperSpeaker creates a Speaker powered by Piper (https://github.com/rhasspy/piper) -func NewPiperSpeaker(v voices.Voice, playbackSpeed float64, playbackPause time.Duration) (Speaker, error) { - var a asset.Asset - if v == voices.MasculineVoice { - a = masculine.Asset - } else { - a = feminine.Asset - } - tts, err := piper.New("", a) - if err != nil { - return nil, fmt.Errorf("failed to create speaker: %w", err) - } - return &piperSynth{tts: tts, speed: playbackSpeed, pauseLength: playbackPause}, nil -} - -// Say implements [Speaker.Say]. 
-func (s *piperSynth) Say(_ context.Context, text string) ([]float32, error) { - synthesized, err := s.tts.Synthesize(text, piper.WithSpeed(float32(s.speed)), piper.WithPause(float32(s.pauseLength.Seconds()))) - if err != nil { - return nil, fmt.Errorf("failed to synthesize text: %w", err) - } - downsampled, err := downsample(synthesized, 22050*unit.Hertz) - if err != nil { - return nil, fmt.Errorf("failed to downsample synthesized audio: %w", err) - } - f32le := pcm.S16LEBytesToF32LE(downsampled) - return f32le, nil -} diff --git a/pkg/synthesizer/speakers/speaker.go b/pkg/synthesizer/speakers/speaker.go index 10585fba..a54f72d8 100644 --- a/pkg/synthesizer/speakers/speaker.go +++ b/pkg/synthesizer/speakers/speaker.go @@ -4,7 +4,9 @@ package speakers import ( "bytes" "context" + "encoding/binary" "fmt" + "math" "github.com/dharmab/skyeye/pkg/pcm/rate" "github.com/martinlindhe/unit" @@ -17,18 +19,31 @@ type Speaker interface { Say(context.Context, string) ([]float32, error) } -func downsample(sample []byte, sourceRate unit.Frequency) ([]byte, error) { +// DownsampleF32 resamples F32LE PCM audio from sourceRate down to 16kHz wideband. 
+func DownsampleF32(samples []float32, sourceRate unit.Frequency) ([]float32, error) { + // Convert []float32 to []byte (F32LE) + input := make([]byte, len(samples)*4) + for i, s := range samples { + binary.LittleEndian.PutUint32(input[i*4:], math.Float32bits(s)) + } + const channels = 1 var buf bytes.Buffer - resampler, err := resample.New(&buf, sourceRate.Hertz(), rate.Wideband.Hertz(), channels, resample.I16, resample.LowQ) + resampler, err := resample.New(&buf, sourceRate.Hertz(), rate.Wideband.Hertz(), channels, resample.F32, resample.LowQ) if err != nil { return nil, fmt.Errorf("failed to create resampler: %w", err) } defer resampler.Close() - _, err = resampler.Write(sample) - if err != nil { + if _, err = resampler.Write(input); err != nil { return nil, fmt.Errorf("failed to resample synthesized audio: %w", err) } - return buf.Bytes(), nil + + // Convert []byte (F32LE) back to []float32 + output := buf.Bytes() + result := make([]float32, len(output)/4) + for i := range result { + result[i] = math.Float32frombits(binary.LittleEndian.Uint32(output[i*4:])) + } + return result, nil } diff --git a/pkg/synthesizer/voices/voices.go b/pkg/synthesizer/voices/voices.go deleted file mode 100644 index 68609fa9..00000000 --- a/pkg/synthesizer/voices/voices.go +++ /dev/null @@ -1,16 +0,0 @@ -// Package voices defines the available voices for the synthesizer package. -// This package is split from speakers to avoid pulling C dependencies into -// half of SkyEye's unit tests. -package voices - -// Voice for text-to-speech synthesis. -type Voice int - -const ( - // FeminineVoice is the "Jenny" en-GB voice. - // Origin: https://github.com/dioco-group/jenny-tts-dataset - FeminineVoice Voice = iota - // MasculineVoice is the "Alan" en-GB voice. - // Origin: https://popey.me - MasculineVoice -)