diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 64352d4..f58c079 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,24 @@ jobs: restore-keys: | ${{ runner.os }}-go- + - name: Install PortAudio (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y portaudio19-dev + + - name: Install PortAudio (macOS) + if: runner.os == 'macOS' + run: brew install portaudio + + - name: Install PortAudio (Windows) + if: runner.os == 'Windows' + run: | + vcpkg install portaudio:x64-windows + echo "PKG_CONFIG_PATH=C:/vcpkg/installed/x64-windows/lib/pkgconfig" >> $env:GITHUB_ENV + echo "CGO_CFLAGS=-IC:/vcpkg/installed/x64-windows/include" >> $env:GITHUB_ENV + echo "CGO_LDFLAGS=-LC:/vcpkg/installed/x64-windows/lib -lportaudio" >> $env:GITHUB_ENV + # Add DLL directory to PATH so tests can find portaudio.dll at runtime + echo "C:/vcpkg/installed/x64-windows/bin" >> $env:GITHUB_PATH + - name: Download dependencies run: go mod download @@ -58,6 +76,9 @@ jobs: with: go-version: "1.24.4" + - name: Install PortAudio + run: sudo apt-get update && sudo apt-get install -y portaudio19-dev + - name: Run golangci-lint uses: golangci/golangci-lint-action@v8 with: @@ -76,6 +97,9 @@ jobs: with: go-version: "1.24.4" + - name: Install PortAudio + run: sudo apt-get update && sudo apt-get install -y portaudio19-dev + - name: Build run: make build diff --git a/.gitignore b/.gitignore index 704d373..7b50a4f 100644 --- a/.gitignore +++ b/.gitignore @@ -61,4 +61,7 @@ Thumbs.db npm-wrapper/bin/ npm-wrapper/node_modules/ npm-wrapper/*.tgz -**/node_modules/ \ No newline at end of file +**/node_modules/ +# Ignore stray binaries +vapi-cli + diff --git a/Makefile b/Makefile index f67620c..dd88d53 100644 --- a/Makefile +++ b/Makefile @@ -129,6 +129,15 @@ lint: @echo "Running linters..." golangci-lint run +# Format Go code +fmt: + @echo "Formatting Go code..." + @$(GOCMD) fmt ./... + @echo "āœ… Go code formatted" + +# Alias +format: fmt + # Run all linters (CLI + MCP server) lint-all: lint lint-mcp @@ -195,6 +204,7 @@ help: @echo " man-pages Generate Unix manual pages" @echo " install Install the CLI and manual pages to ~/.local/" @echo " test Run CLI tests" + @echo " fmt Format Go code" @echo " lint Run CLI linters" @echo " clean Clean CLI build artifacts" @echo "" @@ -228,4 +238,4 @@ help: @echo " make version-set VERSION=1.2.3" @echo " make publish-mcp # Publish MCP server to npm" -.PHONY: all build build-mcp build-all test test-mcp test-all test-coverage clean clean-mcp clean-all tidy deps mcp-deps deps-all lint lint-mcp lint-all man-pages install install-mcp install-all run publish-mcp help \ No newline at end of file +.PHONY: all build build-mcp build-all test test-mcp test-all test-coverage clean clean-mcp clean-all tidy deps mcp-deps deps-all lint lint-mcp lint-all fmt format man-pages install install-mcp install-all run publish-mcp help \ No newline at end of file diff --git a/README.md b/README.md index 5439f6c..1d64c10 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,18 @@ iex ((New-Object System.Net.WebClient).DownloadString('https://vapi.ai/install.p Both scripts automatically detect your platform and install the latest version. +### Audio prerequisite (PortAudio) + +For voice features (microphone and speaker I/O), the CLI relies on the PortAudio runtime. 
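+Not sure whether PortAudio is already on your system? A quick check (assuming
+`pkg-config` is installed) is:
+
+```bash
+# Prints the installed PortAudio version (e.g. "19") when the dev files are present
+pkg-config --modversion portaudio-2.0
+```
+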
Install it with your OS package manager: + +- macOS: `brew install portaudio` +- Debian/Ubuntu: `sudo apt-get update && sudo apt-get install -y libportaudio2 portaudio19-dev` +- Fedora/RHEL: `sudo dnf install -y portaudio portaudio-devel` +- Arch Linux: `sudo pacman -S portaudio` +- Windows: Install PortAudio and ensure `portaudio.dll` is on your PATH (e.g., via vcpkg: `vcpkg install portaudio`, or download the official binary and place the DLL alongside `vapi.exe`). + +If PortAudio is not installed, commands that use voice I/O (like `vapi call voice`) will fail at runtime. + ### Docker ```bash diff --git a/cmd/voice.go b/cmd/voice.go new file mode 100644 index 0000000..5b9bb82 --- /dev/null +++ b/cmd/voice.go @@ -0,0 +1,426 @@ +/* +Copyright Ā© 2025 Vapi, Inc. + +Licensed under the MIT License (the "License"); +you may not use this file except in compliance with the License. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +Authors: + + Dan Goosewin +*/ +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "time" + + vapi "github.com/VapiAI/server-sdk-go" + "github.com/spf13/cobra" + + "github.com/VapiAI/cli/pkg/voice" +) + +var ( + configFile string + audioInputDevice string + audioOutputDevice string + noVideo bool + callTimeout int + audioDebug bool + + // Transient assistant configuration + assistantName string + firstMessage string + voiceID string + model string + systemMessage string +) + +// Voice call management commands +var voiceCmd = &cobra.Command{ + Use: "voice [assistant-id]", + Short: "Start voice call with assistant", + Long: `Start a real-time voice call with a Vapi assistant. + +This command creates a WebSocket connection using Vapi's native transport, +enabling bidirectional audio streaming for natural conversations. + +You can either use an existing assistant ID or create a transient assistant +by specifying configuration flags. + +Voice Call Flow: + 1. Creates a call via Vapi's /call endpoint with WebSocket transport + 2. Establishes WebSocket connection to Vapi's audio transport + 3. Streams microphone audio to the assistant + 4. Plays assistant responses through speakers + +The VAPI_API_KEY will be used from your active CLI account configuration. + +Examples: + # Use existing assistant + vapi call voice asst_12345 + + # Create transient assistant inline + vapi call voice --name "My Assistant" --first-message "Hello! How can I help you?" + + # Advanced transient assistant + vapi call voice --name "Support Bot" --first-message "Hi there!" 
--voice-id "jennifer" --model "gpt-4o" + + # Load from config file + vapi call voice --config ./assistant.json`, + Args: cobra.MaximumNArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + var assistantID string + + // Determine if we're using an existing assistant or creating a transient one + if len(args) > 0 { + // Use existing assistant ID + assistantID = args[0] + } else if configFile != "" { + // Load assistant configuration from JSON file + // Clean the path to prevent directory traversal + cleanPath := filepath.Clean(configFile) + data, err := os.ReadFile(cleanPath) + if err != nil { + return fmt.Errorf("failed to read config file: %w", err) + } + + var config map[string]interface{} + if err := json.Unmarshal(data, &config); err != nil { + return fmt.Errorf("failed to parse config file: %w", err) + } + + // Check if config has existing assistant ID + if id, ok := config["assistant_id"].(string); ok { + assistantID = id + } else if id, ok := config["assistantId"].(string); ok { + assistantID = id + } else { + // No assistant ID found - create transient assistant from config + loadConfigIntoFlags(config) + + createdAssistantID, err := createTransientAssistant() + if err != nil { + return fmt.Errorf("failed to create transient assistant from config: %w", err) + } + assistantID = createdAssistantID + } + } else if assistantName != "" || firstMessage != "" { + // Create transient assistant + createdAssistantID, err := createTransientAssistant() + if err != nil { + return fmt.Errorf("failed to create transient assistant: %w", err) + } + assistantID = createdAssistantID + } else { + return fmt.Errorf("assistant ID is required (provide as argument, via --config, or via transient assistant flags like --name)") + } + + return startVoiceCall(assistantID) + }, +} + +var configureVoiceCmd = &cobra.Command{ + Use: "configure", + Short: "Configure voice call audio devices", + Long: `Configure audio input and output devices for voice calls.`, + RunE: func(cmd *cobra.Command, args []string) error { + fmt.Println("šŸŽ›ļø Voice Call Configuration") + fmt.Println() + + // Create device manager to list devices + deviceManager := voice.NewAudioDeviceManager() + if err := deviceManager.Initialize(); err != nil { + return fmt.Errorf("failed to initialize audio system: %w", err) + } + defer func() { + if err := deviceManager.Terminate(); err != nil { + fmt.Printf("Failed to terminate device manager: %v\n", err) + } + }() + + // List available devices + deviceList, err := deviceManager.ListDevices() + if err != nil { + return fmt.Errorf("failed to list audio devices: %w", err) + } + + fmt.Println("Available audio devices:") + fmt.Print(deviceList) + + fmt.Println("Configuration:") + fmt.Println("- Use device names with --audio-input and --audio-output flags") + fmt.Println("- Use 'default' to use system default devices") + fmt.Println() + fmt.Println("Example:") + fmt.Println(" vapi call voice asst_12345 --audio-input \"Built-in Microphone\"") + + return nil + }, +} + +var testAudioCmd = &cobra.Command{ + Use: "test-audio", + Short: "Test audio devices", + Long: `Test microphone and speaker functionality for voice calls.`, + RunE: func(cmd *cobra.Command, args []string) error { + fmt.Println("šŸŽ¤ Audio Test") + fmt.Println() + + // Create a basic audio stream to test devices + config := voice.DefaultWebRTCConfig() + if audioInputDevice != "" { + config.AudioInputDevice = audioInputDevice + } + if audioOutputDevice != "" { + config.AudioOutputDevice = audioOutputDevice + } + + audioStream, err 
:= voice.NewAudioStream(config)
+		if err != nil {
+			return fmt.Errorf("failed to create audio stream: %w", err)
+		}
+
+		fmt.Println("Testing audio devices...")
+		fmt.Printf("Input device: %s\n", config.AudioInputDevice)
+		fmt.Printf("Output device: %s\n", config.AudioOutputDevice)
+		fmt.Println()
+
+		// Try to start the audio stream briefly
+		if err := audioStream.Start(); err != nil {
+			return fmt.Errorf("failed to start audio stream: %w", err)
+		}
+
+		fmt.Println("āœ… Audio devices initialized successfully!")
+		fmt.Printf("Input device: %s\n", audioStream.GetInputDevice().Name)
+		fmt.Printf("Output device: %s\n", audioStream.GetOutputDevice().Name)
+		fmt.Println()
+
+		// Test for a brief moment
+		fmt.Println("Testing audio for 3 seconds...")
+		time.Sleep(3 * time.Second)
+
+		// Get audio levels
+		inputLevel, outputLevel := audioStream.GetInputLevel(), audioStream.GetOutputLevel()
+		fmt.Printf("Input level: %.1f%%\n", inputLevel*100)
+		fmt.Printf("Output level: %.1f%%\n", outputLevel*100)
+
+		// Stop the audio stream
+		if err := audioStream.Stop(); err != nil {
+			fmt.Printf("Warning: %v\n", err)
+		}
+
+		fmt.Println()
+		fmt.Println("āœ… Audio test completed!")
+		return nil
+	},
+}
+
+var statusVoiceCmd = &cobra.Command{
+	Use:   "status",
+	Short: "Show voice call status",
+	Long:  `Display the status of the current voice call.`,
+	RunE: func(cmd *cobra.Command, args []string) error {
+		fmt.Println("šŸ“ž Voice Call Status")
+		fmt.Println()
+		fmt.Println("No active voice call.")
+		fmt.Println()
+		fmt.Println("Start a call with:")
+		fmt.Println("  vapi call voice <assistant-id>")
+		return nil
+	},
+}
+
+var endVoiceCmd = &cobra.Command{
+	Use:   "end",
+	Short: "End current voice call",
+	Long:  `Terminate the current voice call.`,
+	RunE: func(cmd *cobra.Command, args []string) error {
+		fmt.Println("šŸ“ž End Voice Call")
+		fmt.Println()
+		fmt.Println("No active call to end.")
+		fmt.Println()
+		fmt.Println("Calls can be ended by pressing Ctrl+C during an active call.")
+		return nil
+	},
+}
+
+// loadConfigIntoFlags loads configuration from a JSON config into the flag variables
+func loadConfigIntoFlags(config map[string]interface{}) {
+	// Load name
+	if name, ok := config["name"].(string); ok {
+		assistantName = name
+	}
+
+	// Load first message
+	if msg, ok := config["first_message"].(string); ok {
+		firstMessage = msg
+	} else if msg, ok := config["firstMessage"].(string); ok {
+		firstMessage = msg
+	}
+
+	// Load voice ID
+	if voiceValue, ok := config["voice_id"].(string); ok {
+		voiceID = voiceValue
+	} else if voiceValue, ok := config["voiceId"].(string); ok {
+		voiceID = voiceValue
+	}
+
+	// Load model
+	if mdl, ok := config["model"].(string); ok {
+		model = mdl
+	}
+
+	// Load system message
+	if sysMsg, ok := config["system_message"].(string); ok {
+		systemMessage = sysMsg
+	} else if sysMsg, ok := config["systemMessage"].(string); ok {
+		systemMessage = sysMsg
+	}
+}
+
+// createTransientAssistant creates a temporary assistant for the voice call
+func createTransientAssistant() (string, error) {
+	fmt.Println("šŸ¤– Creating transient assistant...")
+
+	// Get Vapi client
+	if vapiClient.GetClient() == nil {
+		return "", fmt.Errorf("no active Vapi account found. Please run 'vapi login' first")
+	}
+
+	// Set defaults if not provided
+	name := assistantName
+	if name == "" {
+		name = "Transient Assistant"
+	}
+
+	message := firstMessage
+	if message == "" {
+		message = "Hello! How can I assist you today?"
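+		// A generic fallback greeting; override it with --first-message or the
+		// config file's firstMessage field.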
+ } + + ctx := context.Background() + + // Create the assistant request + createRequest := &vapi.CreateAssistantDto{ + Name: &name, + FirstMessage: &message, + Voice: &vapi.CreateAssistantDtoVoice{ + VapiVoice: &vapi.VapiVoice{ + VoiceId: vapi.VapiVoiceVoiceIdElliot, // Default voice + }, + }, + } + + // Note: For now, we'll keep it simple and just use the default voice and model + // Advanced voice/model configuration can be added later once we understand the full API structure + if voiceID != "" { + fmt.Printf("ā„¹ļø Voice ID '%s' specified but using default voice for now\n", voiceID) + } + if model != "" { + fmt.Printf("ā„¹ļø Model '%s' specified but using default model for now\n", model) + } + if systemMessage != "" { + fmt.Printf("ā„¹ļø System message specified but using default behavior for now\n") + } + + // Create the assistant + assistant, err := vapiClient.GetClient().Assistants.Create(ctx, createRequest) + if err != nil { + return "", fmt.Errorf("failed to create transient assistant: %w", err) + } + + fmt.Printf("āœ… Created transient assistant: %s (ID: %s)\n", name, assistant.Id) + return assistant.Id, nil +} + +// startVoiceCall initiates a voice call with the specified assistant +func startVoiceCall(assistantID string) error { + fmt.Printf("šŸš€ Starting voice call with assistant: %s\n", assistantID) + fmt.Println() + + // Create voice call configuration + config := voice.DefaultWebRTCConfig() + + // Override with command line options + if audioInputDevice != "" { + config.AudioInputDevice = audioInputDevice + } + if audioOutputDevice != "" { + config.AudioOutputDevice = audioOutputDevice + } + config.VideoEnabled = !noVideo + config.AudioDebug = audioDebug + + // Get Vapi API configuration from the CLI client + if vapiClient.GetClient() == nil { + return fmt.Errorf("no active Vapi account found. Please run 'vapi login' first") + } + + // Set Vapi API key from the active account configuration + if apiKey := vapiClient.GetConfig().GetActiveAPIKey(); apiKey != "" { + config.VapiAPIKey = apiKey + } else { + return fmt.Errorf("VAPI_API_KEY not found. 
Please run 'vapi login' to authenticate") + } + + // Set API base URL from configuration + config.VapiBaseURL = vapiClient.GetConfig().GetAPIBaseURL() + + // Set public API key from environment if provided + if pub := os.Getenv("VAPI_PUBLIC_KEY"); pub != "" { + config.VapiPublicAPIKey = pub + } + + // Create voice client + client, err := voice.NewVoiceClient(config, vapiClient.GetClient()) + if err != nil { + return fmt.Errorf("failed to create voice client: %w", err) + } + + // Create terminal UI + ui := voice.NewTerminalUI(client) + + // Start the call + if err := client.StartCall(assistantID); err != nil { + return fmt.Errorf("failed to start voice call: %w", err) + } + + // Run the terminal UI (this blocks until call ends) + return ui.Run() +} + +func init() { + // Add voice as a subcommand of call + callCmd.AddCommand(voiceCmd) + voiceCmd.AddCommand(configureVoiceCmd) + voiceCmd.AddCommand(testAudioCmd) + voiceCmd.AddCommand(statusVoiceCmd) + voiceCmd.AddCommand(endVoiceCmd) + + // Add flags to the main voice command + voiceCmd.Flags().StringVar(&configFile, "config", "", "Path to assistant configuration JSON file") + voiceCmd.Flags().StringVar(&audioInputDevice, "audio-input", "", "Audio input device name") + voiceCmd.Flags().StringVar(&audioOutputDevice, "audio-output", "", "Audio output device name") + voiceCmd.Flags().IntVar(&callTimeout, "timeout", 30, "Call timeout in minutes") + voiceCmd.Flags().BoolVar(&audioDebug, "audio-debug", false, "Enable audio debugging (saves input/output to WAV files)") + + // Transient assistant flags + voiceCmd.Flags().StringVar(&assistantName, "name", "", "Name for transient assistant") + voiceCmd.Flags().StringVar(&firstMessage, "first-message", "", "First message from transient assistant") + voiceCmd.Flags().StringVar(&voiceID, "voice-id", "", "Voice ID for transient assistant (jennifer, derek, elliot)") + voiceCmd.Flags().StringVar(&model, "model", "", "AI model for transient assistant (gpt-4o, gpt-4o-mini, etc.)") + voiceCmd.Flags().StringVar(&systemMessage, "system-message", "", "System message for transient assistant") +} diff --git a/DEVELOPMENT.md b/docs/DEVELOPMENT.md similarity index 100% rename from DEVELOPMENT.md rename to docs/DEVELOPMENT.md diff --git a/RELEASING.md b/docs/RELEASING.md similarity index 100% rename from RELEASING.md rename to docs/RELEASING.md diff --git a/docs/WEBRTC_IMPLEMENTATION_PLAN.md b/docs/WEBRTC_IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..0df60ef --- /dev/null +++ b/docs/WEBRTC_IMPLEMENTATION_PLAN.md @@ -0,0 +1,452 @@ +# WebRTC Call Implementation Plan for Vapi CLI + +## Overview +This document outlines the implementation plan for adding WebRTC calling functionality to the Vapi CLI using Pion WebRTC library and Daily.co as an intermediary service. + +## Architecture + +### High-Level Components +1. **CLI Command Interface** - New `vapi webrtc` command group +2. **WebRTC Client** - Pion-based WebRTC implementation +3. **Daily.co Integration** - Room management and signaling via Daily.co API +4. **Vapi Integration** - Connect with existing Vapi assistant/call infrastructure +5. 
**Audio/Video Pipeline** - Handle media streams for voice/video calls + +### Technology Stack +- **WebRTC Library**: Pion WebRTC v3 (github.com/pion/webrtc/v3) +- **Signaling Service**: Daily.co API +- **HTTP Client**: Standard Go net/http or existing client in codebase +- **Audio Processing**: Pion's built-in audio codecs (Opus, PCM) +- **CLI Framework**: Cobra (already in use) + +## Debug Webhook System (Using Existing Infrastructure) + +### Integration with Existing `vapi listen` Command +The WebRTC implementation will leverage the existing robust webhook infrastructure: + +```go +type WebRTCDebugger struct { + webhookURL string // URL for vapi listen forwarding + events chan WebhookEvent + ui *TerminalUI + callID string // Track specific WebRTC call events +} + +type WebhookEvent struct { + Timestamp time.Time `json:"timestamp"` + Type string `json:"type"` // From existing webhook types + CallID string `json:"call_id"` + Data interface{} `json:"data"` + SessionID string `json:"session_id"` +} +``` + +### Debug Integration Modes +1. **Auto-Start Listen Server** (`--debug`) + - Automatically launches `vapi listen --forward-to localhost:3000/webhook` + - Integrates webhook events into WebRTC terminal UI + - Filters events by call ID for relevant debugging + +2. **External Webhook Integration** (`--debug-webhook `) + - Uses existing webhook forwarding to external URL + - Leverages existing authentication and retry logic + - Maintains compatibility with current webhook tooling + +3. **Existing File Logging** + - Uses existing structured logging from `vapi listen` + - Filters WebRTC-specific events for analysis + +### Command Integration Examples +```bash +# WebRTC call with auto-debug (leverages existing listen command) +vapi call webrtc asst_12345 --debug +# Internally runs: vapi listen --forward-to localhost:3000/debug & + +# WebRTC call with external webhook (uses existing infrastructure) +vapi call webrtc asst_12345 --debug-webhook http://localhost:8080/webhook + +# WebRTC call with JSON config and debug +vapi call webrtc --config ./assistant.json --debug + +# Manual setup using existing commands +vapi listen --forward-to localhost:3000/webhook & +vapi call webrtc asst_12345 +``` + +### Terminal Flow Integration +```go +type TerminalUI struct { + callStatus *CallStatusView + debugPanel *DebugPanelView + audioLevels *AudioLevelsView + controls *ControlsView +} + +// Real-time terminal layout +ā”Œā”€ Call Status ────────────────────────────────────────────────────┐ +│ 🟢 Connected to: Daily Room "test-call-1234" │ +│ šŸ‘¤ Participants: You, Vapi Assistant │ +│ ā±ļø Duration: 00:02:34 │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +ā”Œā”€ Audio Levels ───────────────────────────────────────────────────┐ +│ šŸŽ¤ Input: ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–’ā–’ 80% │ +│ šŸ”Š Output: ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–’ā–’ā–’ā–’ 60% │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +ā”Œā”€ Debug Events ───────────────────────────────────────────────────┐ +│ [14:23:45] POST /v1/calls → 201 Created │ +│ [14:23:46] GET /v1/assistants/asst_123 → 200 OK │ +│ [14:23:47] WebSocket: connection established │ +│ [14:23:48] WebRTC: ICE candidate received │ 
+ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +ā”Œā”€ Controls ───────────────────────────────────────────────────────┐ +│ [m] Mute [h] Hang up [d] Toggle debug [q] Quit │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +## Terminal Flow Design + +### Non-blocking Terminal UI +```go +type CallInterface struct { + done chan bool + keyEvents chan rune + uiUpdates chan UIUpdate + callEvents chan CallEvent +} + +// Goroutine structure +func (c *CallInterface) Run() { + go c.handleKeyInput() // Non-blocking keyboard input + go c.handleCallEvents() // WebRTC/Vapi event processing + go c.handleUIUpdates() // Terminal display updates + go c.handleWebhookEvents() // Debug webhook processing + + // Main event loop + for { + select { + case key := <-c.keyEvents: + c.handleKeyPress(key) + case event := <-c.callEvents: + c.updateCallStatus(event) + case update := <-c.uiUpdates: + c.refreshDisplay(update) + case <-c.done: + return + } + } +} +``` + +### Key Controls During Call +- `m`: Toggle mute/unmute +- `h`: Hang up call +- `d`: Toggle debug panel visibility +- `v`: Adjust volume levels +- `r`: Start/stop recording +- `t`: Show call transcript +- `q`: Quit (with confirmation) +- `↑/↓`: Scroll through debug events + +### Terminal State Management +```go +type TerminalState struct { + mode DisplayMode // Normal, Debug, Transcript + callActive bool + muted bool + recording bool + debugVisible bool + scrollPos int +} +``` + +## Implementation Phases + +### Phase 1: Core Infrastructure +1. **Add Dependencies** + ```go + // Add to go.mod + github.com/pion/webrtc/v3 v3.x.x + github.com/pion/interceptor v0.x.x + ``` + +2. **Create WebRTC Package Structure** + ``` + pkg/webrtc/ + ā”œā”€ā”€ client.go // Main WebRTC client + ā”œā”€ā”€ daily.go // Daily.co API integration + ā”œā”€ā”€ signaling.go // WebRTC signaling handling + ā”œā”€ā”€ media.go // Audio/video stream management + ā”œā”€ā”€ config.go // WebRTC configuration + ā”œā”€ā”€ audio.go // PortAudio integration + ā”œā”€ā”€ devices.go // Audio device management + ā”œā”€ā”€ api.go // Vapi API request/response handling + ā”œā”€ā”€ terminal.go // Terminal UI management + └── diagnostics.go // Connection diagnostics + ``` + +3. **Daily.co Integration** + ```go + type DailyClient struct { + apiKey string + domain string + httpClient *http.Client + } + + type Room struct { + Name string `json:"name"` + URL string `json:"url"` + Config *RoomConfig `json:"config,omitempty"` + CreatedAt time.Time `json:"created_at"` + Privacy string `json:"privacy"` // "public" | "private" + } + + type RoomConfig struct { + MaxParticipants int `json:"max_participants"` + EnableChat bool `json:"enable_chat"` + EnableRecording bool `json:"enable_recording"` + AudioOnly bool `json:"audio_only"` + } + + // Room management methods + func (d *DailyClient) CreateRoom(name string, config *RoomConfig) (*Room, error) + func (d *DailyClient) GetRoom(name string) (*Room, error) + func (d *DailyClient) DeleteRoom(name string) error + func (d *DailyClient) GenerateToken(roomName string, props *TokenProperties) (string, error) + ``` + + **Authentication Flow:** + 1. Create room via Daily.co REST API with API key + 2. Generate meeting token for secure room access + 3. 
Connect to Daily.co WebSocket with token + 4. Handle room events and participant management + +### Phase 2: CLI Commands +Add new command group under existing `call` command: + +``` +vapi call webrtc [options] // Start WebRTC call with assistant +vapi call webrtc --config [options] // Start with JSON config +vapi call webrtc configure // Configure audio devices +vapi call webrtc test-audio // Test microphone/speakers +vapi call webrtc status // Show current call status +vapi call webrtc end // End current WebRTC call +vapi call webrtc diagnostics // Connection diagnostics +``` + +**Primary Command Usage:** +```bash +# Start call with assistant ID +vapi call webrtc asst_12345 --debug-webhook http://localhost:3000/webhook + +# Start call with JSON config +vapi call webrtc --config ./my-assistant.json --debug +``` + +**Command Flags:** +- `--room-name`: Custom room name (default: auto-generated) +- `--debug-webhook`: URL to receive debug request/response data +- `--debug`: Enable debug mode with local webhook server +- `--audio-input`: Specific audio input device +- `--audio-output`: Specific audio output device +- `--config`: Assistant configuration JSON file +- `--no-video`: Audio-only mode +- `--record`: Enable call recording + +### Phase 3: WebRTC Implementation +1. **Peer Connection Setup** + - Initialize Pion WebRTC peer connection + - Configure ICE servers and STUN/TURN + - Handle offer/answer exchange via Daily.co + +2. **Media Handling** + - Audio input/output (microphone/speakers) + - Optional video support + - Integration with Vapi's voice processing + +3. **Signaling Protocol** + - WebSocket connection to Daily.co + - Handle ICE candidates exchange + - Room state management + +### Phase 4: Vapi Integration +1. **Assistant Connection** + - Route audio to/from Vapi assistant + - Handle call events and state changes + - Integrate with existing Vapi call infrastructure + +2. 
**Call Management** + - Link WebRTC calls with Vapi call records + - Transcript and recording integration + - Billing and analytics + +## File Structure Changes + +### New Files to Create +``` +cmd/webrtc.go // WebRTC CLI commands +pkg/webrtc/client.go // Main WebRTC client +pkg/webrtc/daily.go // Daily.co API client +pkg/webrtc/signaling.go // WebRTC signaling +pkg/webrtc/media.go // Media stream handling +pkg/webrtc/config.go // Configuration +pkg/webrtc/audio.go // PortAudio integration +pkg/webrtc/devices.go // Audio device management +pkg/webrtc/api.go // Vapi API request/response handling +pkg/webrtc/terminal.go // Terminal UI management +pkg/webrtc/diagnostics.go // Connection diagnostics +``` + +### Modified Files +``` +cmd/call.go // Add WebRTC subcommands +go.mod // Add Pion WebRTC dependencies +``` + +## Dependencies + +### Required Go Modules +```go +// Core WebRTC +github.com/pion/webrtc/v3 v3.2.40 // Core WebRTC implementation +github.com/pion/interceptor v0.1.25 // WebRTC interceptors +github.com/pion/opus v0.4.0 // Opus audio codec +github.com/pion/rtp v1.8.2 // RTP packet handling + +// Audio System +github.com/gordonklaus/portaudio latest // Cross-platform audio I/O +github.com/yourusername/go-audio latest // Audio format conversion + +// Networking +github.com/gorilla/websocket v1.5.1 // WebSocket for signaling + +// Utilities +github.com/google/uuid v1.6.0 // Room ID generation +github.com/fatih/color v1.15.0 // Terminal colors for status +``` + +### Daily.co API Requirements +- Daily.co API key for room management +- WebSocket endpoint for real-time signaling +- REST API for room creation/management + +## Configuration + +### Environment Variables +```bash +DAILY_API_KEY=your_daily_api_key +DAILY_DOMAIN=your_daily_domain.daily.co +WEBRTC_STUN_SERVERS=stun:stun.l.google.com:19302 +WEBRTC_TURN_SERVERS=turn:your-turn-server.com +WEBRTC_AUDIO_INPUT_DEVICE=default +WEBRTC_AUDIO_OUTPUT_DEVICE=default +``` + +### CLI Configuration +Extend existing config.go to include: +```go +type WebRTCConfig struct { + DailyAPIKey string `mapstructure:"daily_api_key"` + DailyDomain string `mapstructure:"daily_domain"` + STUNServers []string `mapstructure:"stun_servers"` + TURNServers []string `mapstructure:"turn_servers"` + AudioCodec string `mapstructure:"audio_codec"` // opus, pcm + VideoEnabled bool `mapstructure:"video_enabled"` +} +``` + +## Implementation Steps + +### Step 1: Setup and Dependencies +1. Add Pion WebRTC to go.mod +2. Create basic pkg/webrtc package structure +3. Implement Daily.co API client for room management + +### Step 2: CLI Commands +1. Create cmd/webrtc.go with basic command structure +2. Implement room creation and joining commands +3. Add configuration handling for Daily.co credentials + +### Step 3: WebRTC Core +1. Implement basic peer connection setup +2. Add signaling via Daily.co WebSocket +3. Handle offer/answer exchange and ICE candidates + +### Step 4: Media Pipeline +1. **Audio Device Setup** + ```go + // Initialize PortAudio + portaudio.Initialize() + defer portaudio.Terminate() + + // Enumerate audio devices + devices, err := portaudio.Devices() + ``` + +2. **Audio Input Pipeline** + ```go + // Microphone -> PCM Buffer -> Opus Encoder -> WebRTC Track + inputStream := setupAudioInput(selectedDevice) + opusEncoder := opus.NewEncoder(48000, 1, opus.AppVoIP) + audioTrack := setupWebRTCAudioTrack() + ``` + +3. 
**Audio Output Pipeline** + ```go + // WebRTC Track -> Opus Decoder -> PCM Buffer -> Speakers + outputStream := setupAudioOutput(selectedDevice) + opusDecoder := opus.NewDecoder(48000, 1) + ``` + +4. **Route audio to/from Vapi assistant** + - Bidirectional audio stream routing + - Real-time audio processing and forwarding + +### Step 5: Integration and Testing +1. Connect WebRTC calls with Vapi call management +2. Add call state tracking and events +3. Test end-to-end call scenarios + +## Security Considerations + +1. **API Key Management**: Secure storage of Daily.co API keys +2. **Media Encryption**: Ensure DTLS/SRTP encryption is enabled +3. **Authentication**: Validate room access and user permissions +4. **Network Security**: Proper STUN/TURN server configuration + +## Testing Strategy + +1. **Unit Tests**: Individual component testing +2. **Integration Tests**: Daily.co API integration +3. **End-to-End Tests**: Full call scenarios +4. **Performance Tests**: Media quality and latency + +## Potential Challenges + +1. **Audio Routing**: Complex audio pipeline between WebRTC and Vapi +2. **NAT Traversal**: STUN/TURN server configuration +3. **Cross-Platform**: Audio device handling across different OS +4. **Error Handling**: Robust connection failure recovery +5. **Synchronization**: Managing call state between WebRTC and Vapi + +## Success Metrics + +1. Successful peer-to-peer connection establishment +2. Clear audio quality with low latency +3. Reliable connection through NAT/firewalls +4. Seamless integration with existing Vapi workflows +5. Proper call state management and recording + +## Future Enhancements + +1. **Video Support**: Add video calling capabilities +2. **Screen Sharing**: Implement screen sharing via WebRTC +3. **Multi-party Calls**: Support for conference calls +4. **Recording**: Direct WebRTC call recording +5. 
**Mobile Support**: Extend to mobile platforms via Go Mobile + +## Resources + +- [Pion WebRTC Documentation](https://pkg.go.dev/github.com/pion/webrtc/v3) +- [Daily.co API Documentation](https://docs.daily.co/) +- [WebRTC Standards](https://webrtc.org/) +- [Pion Examples](https://github.com/pion/example-webrtc-applications) \ No newline at end of file diff --git a/go.mod b/go.mod index 87ba042..599db39 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,8 @@ require ( github.com/AlecAivazis/survey/v2 v2.3.7 github.com/VapiAI/server-sdk-go v0.9.0 github.com/charmbracelet/lipgloss v1.1.0 + github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b + github.com/gorilla/websocket v1.5.1 github.com/posthog/posthog-go v1.5.12 github.com/spf13/cobra v1.9.1 github.com/spf13/viper v1.20.1 @@ -33,6 +35,23 @@ require ( github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/pion/datachannel v1.5.10 // indirect + github.com/pion/dtls/v3 v3.0.6 // indirect + github.com/pion/ice/v4 v4.0.10 // indirect + github.com/pion/interceptor v0.1.40 // indirect + github.com/pion/logging v0.2.4 // indirect + github.com/pion/mdns/v2 v2.0.7 // indirect + github.com/pion/mediadevices v0.7.1 // indirect + github.com/pion/randutil v0.1.0 // indirect + github.com/pion/rtcp v1.2.15 // indirect + github.com/pion/rtp v1.8.20 // indirect + github.com/pion/sctp v1.8.39 // indirect + github.com/pion/sdp/v3 v3.0.14 // indirect + github.com/pion/srtp/v3 v3.0.6 // indirect + github.com/pion/stun/v3 v3.0.0 // indirect + github.com/pion/transport/v3 v3.0.7 // indirect + github.com/pion/turn/v4 v4.0.0 // indirect + github.com/pion/webrtc/v4 v4.1.3 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect @@ -42,8 +61,12 @@ require ( github.com/spf13/cast v1.9.2 // indirect github.com/spf13/pflag v1.0.6 // indirect github.com/subosito/gotenv v1.6.0 // indirect + github.com/wlynxg/anet v0.0.5 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect go.uber.org/multierr v1.11.0 // indirect + golang.org/x/crypto v0.33.0 // indirect + golang.org/x/image v0.23.0 // indirect + golang.org/x/net v0.35.0 // indirect golang.org/x/sys v0.33.0 // indirect golang.org/x/term v0.32.0 // indirect golang.org/x/text v0.26.0 // indirect diff --git a/go.sum b/go.sum index 4beae78..0de233f 100644 --- a/go.sum +++ b/go.sum @@ -33,6 +33,10 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b h1:WEuQWBxelOGHA6z9lABqaMLMrfwVyMdN3UgRLT+YUPo= +github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b/go.mod h1:esZFQEUwqC+l76f2R8bIWSwXMaPbp79PppwZ1eJhFco= +github.com/gorilla/websocket v1.5.1 h1:gmztn0JnHVt9JZquRuzLw3g4wouNVzKL15iLr/zn/QY= +github.com/gorilla/websocket v1.5.1/go.mod h1:x3kM2JMyaluk02fnUJpQuwD2dCS5NDG2ZHL0uE0tcaY= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hinshun/vt10x 
v0.0.0-20220119200601-820417d04eec h1:qv2VnGeEQHchGaZ/u7lxST/RaJw+cv273q79D81Xbog= @@ -62,6 +66,58 @@ github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= +github.com/pion/datachannel v1.5.10 h1:ly0Q26K1i6ZkGf42W7D4hQYR90pZwzFOjTq5AuCKk4o= +github.com/pion/datachannel v1.5.10/go.mod h1:p/jJfC9arb29W7WrxyKbepTU20CFgyx5oLo8Rs4Py/M= +github.com/pion/dtls/v3 v3.0.4 h1:44CZekewMzfrn9pmGrj5BNnTMDCFwr+6sLH+cCuLM7U= +github.com/pion/dtls/v3 v3.0.4/go.mod h1:R373CsjxWqNPf6MEkfdy3aSe9niZvL/JaKlGeFphtMg= +github.com/pion/dtls/v3 v3.0.6 h1:7Hkd8WhAJNbRgq9RgdNh1aaWlZlGpYTzdqjy9x9sK2E= +github.com/pion/dtls/v3 v3.0.6/go.mod h1:iJxNQ3Uhn1NZWOMWlLxEEHAN5yX7GyPvvKw04v9bzYU= +github.com/pion/ice/v4 v4.0.6 h1:jmM9HwI9lfetQV/39uD0nY4y++XZNPhvzIPCb8EwxUM= +github.com/pion/ice/v4 v4.0.6/go.mod h1:y3M18aPhIxLlcO/4dn9X8LzLLSma84cx6emMSu14FGw= +github.com/pion/ice/v4 v4.0.10 h1:P59w1iauC/wPk9PdY8Vjl4fOFL5B+USq1+xbDcN6gT4= +github.com/pion/ice/v4 v4.0.10/go.mod h1:y3M18aPhIxLlcO/4dn9X8LzLLSma84cx6emMSu14FGw= +github.com/pion/interceptor v0.1.37 h1:aRA8Zpab/wE7/c0O3fh1PqY0AJI3fCSEM5lRWJVorwI= +github.com/pion/interceptor v0.1.37/go.mod h1:JzxbJ4umVTlZAf+/utHzNesY8tmRkM2lVmkS82TTj8Y= +github.com/pion/interceptor v0.1.40 h1:e0BjnPcGpr2CFQgKhrQisBU7V3GXK6wrfYrGYaU6Jq4= +github.com/pion/interceptor v0.1.40/go.mod h1:Z6kqH7M/FYirg3frjGJ21VLSRJGBXB/KqaTIrdqnOic= +github.com/pion/logging v0.2.3 h1:gHuf0zpoh1GW67Nr6Gj4cv5Z9ZscU7g/EaoC/Ke/igI= +github.com/pion/logging v0.2.3/go.mod h1:z8YfknkquMe1csOrxK5kc+5/ZPAzMxbKLX5aXpbpC90= +github.com/pion/logging v0.2.4 h1:tTew+7cmQ+Mc1pTBLKH2puKsOvhm32dROumOZ655zB8= +github.com/pion/logging v0.2.4/go.mod h1:DffhXTKYdNZU+KtJ5pyQDjvOAh/GsNSyv1lbkFbe3so= +github.com/pion/mdns/v2 v2.0.7 h1:c9kM8ewCgjslaAmicYMFQIde2H9/lrZpjBkN8VwoVtM= +github.com/pion/mdns/v2 v2.0.7/go.mod h1:vAdSYNAT0Jy3Ru0zl2YiW3Rm/fJCwIeM0nToenfOJKA= +github.com/pion/mediadevices v0.7.1 h1:ayMneLx1ymJr0rVRn01foqu8LO/FQ97MS1IKM/XgpuY= +github.com/pion/mediadevices v0.7.1/go.mod h1:89jObwFJ4IkL2vkaN8Gq9tSjp0jAY4JtTJ84Ix+QODQ= +github.com/pion/randutil v0.1.0 h1:CFG1UdESneORglEsnimhUjf33Rwjubwj6xfiOXBa3mA= +github.com/pion/randutil v0.1.0/go.mod h1:XcJrSMMbbMRhASFVOlj/5hQial/Y8oH/HVo7TBZq+j8= +github.com/pion/rtcp v1.2.15 h1:LZQi2JbdipLOj4eBjK4wlVoQWfrZbh3Q6eHtWtJBZBo= +github.com/pion/rtcp v1.2.15/go.mod h1:jlGuAjHMEXwMUHK78RgX0UmEJFV4zUKOFHR7OP+D3D0= +github.com/pion/rtp v1.8.11 h1:17xjnY5WO5hgO6SD3/NTIUPvSFw/PbLsIJyz1r1yNIk= +github.com/pion/rtp v1.8.11/go.mod h1:8uMBJj32Pa1wwx8Fuv/AsFhn8jsgw+3rUC2PfoBZ8p4= +github.com/pion/rtp v1.8.20 h1:8zcyqohadZE8FCBeGdyEvHiclPIezcwRQH9zfapFyYI= +github.com/pion/rtp v1.8.20/go.mod h1:bAu2UFKScgzyFqvUKmbvzSdPr+NGbZtv6UB2hesqXBk= +github.com/pion/sctp v1.8.35 h1:qwtKvNK1Wc5tHMIYgTDJhfZk7vATGVHhXbUDfHbYwzA= +github.com/pion/sctp v1.8.35/go.mod h1:EcXP8zCYVTRy3W9xtOF7wJm1L1aXfKRQzaM33SjQlzg= +github.com/pion/sctp v1.8.39 h1:PJma40vRHa3UTO3C4MyeJDQ+KIobVYRZQZ0Nt7SjQnE= +github.com/pion/sctp v1.8.39/go.mod h1:cNiLdchXra8fHQwmIoqw0MbLLMs+f7uQ+dGMG2gWebE= +github.com/pion/sdp/v3 v3.0.10 h1:6MChLE/1xYB+CjumMw+gZ9ufp2DPApuVSnDT8t5MIgA= +github.com/pion/sdp/v3 v3.0.10/go.mod h1:88GMahN5xnScv1hIMTqLdu/cOcUkj6a9ytbncwMCq2E= +github.com/pion/sdp/v3 v3.0.14 h1:1h7gBr9FhOWH5GjWWY5lcw/U85MtdcibTyt/o6RxRUI= 
+github.com/pion/sdp/v3 v3.0.14/go.mod h1:88GMahN5xnScv1hIMTqLdu/cOcUkj6a9ytbncwMCq2E= +github.com/pion/srtp/v3 v3.0.4 h1:2Z6vDVxzrX3UHEgrUyIGM4rRouoC7v+NiF1IHtp9B5M= +github.com/pion/srtp/v3 v3.0.4/go.mod h1:1Jx3FwDoxpRaTh1oRV8A/6G1BnFL+QI82eK4ms8EEJQ= +github.com/pion/srtp/v3 v3.0.6 h1:E2gyj1f5X10sB/qILUGIkL4C2CqK269Xq167PbGCc/4= +github.com/pion/srtp/v3 v3.0.6/go.mod h1:BxvziG3v/armJHAaJ87euvkhHqWe9I7iiOy50K2QkhY= +github.com/pion/stun/v3 v3.0.0 h1:4h1gwhWLWuZWOJIJR9s2ferRO+W3zA/b6ijOI6mKzUw= +github.com/pion/stun/v3 v3.0.0/go.mod h1:HvCN8txt8mwi4FBvS3EmDghW6aQJ24T+y+1TKjB5jyU= +github.com/pion/transport/v3 v3.0.7 h1:iRbMH05BzSNwhILHoBoAPxoB9xQgOaJk+591KC9P1o0= +github.com/pion/transport/v3 v3.0.7/go.mod h1:YleKiTZ4vqNxVwh77Z0zytYi7rXHl7j6uPLGhhz9rwo= +github.com/pion/turn/v4 v4.0.0 h1:qxplo3Rxa9Yg1xXDxxH8xaqcyGUtbHYw4QSCvmFWvhM= +github.com/pion/turn/v4 v4.0.0/go.mod h1:MuPDkm15nYSklKpN8vWJ9W2M0PlyQZqYt1McGuxG7mA= +github.com/pion/webrtc/v4 v4.0.9 h1:PyOYMRKJgfy0dzPcYtFD/4oW9zaw3Ze3oZzzbj2LV9E= +github.com/pion/webrtc/v4 v4.0.9/go.mod h1:ViHLVaNpiuvaH8pdiuQxuA9awuE6KVzAXx3vVWilOck= +github.com/pion/webrtc/v4 v4.1.3 h1:YZ67Boj9X/hk190jJZ8+HFGQ6DqSZ/fYP3sLAZv7c3c= +github.com/pion/webrtc/v4 v4.1.3/go.mod h1:rsq+zQ82ryfR9vbb0L1umPJ6Ogq7zm8mcn9fcGnxomM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/posthog/posthog-go v1.5.12 h1:nxK/z5QLCFxwzxV8GNvVd4Y1wJ++zJSWMGEtzU+/HLM= @@ -93,6 +149,8 @@ github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOf github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/wlynxg/anet v0.0.5 h1:J3VJGi1gvo0JwZ/P1/Yc/8p63SoW98B5dHkYDmpgvvU= +github.com/wlynxg/anet v0.0.5/go.mod h1:eay5PRQr7fIVAMbTbchTnO9gG65Hg/uYGdc7mguHxoA= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= @@ -100,12 +158,24 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= +golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= +golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= golang.org/x/exp v0.0.0-20220909182711-5c715a9e8561 h1:MDc5xs78ZrZr3HMQugiXOAkSZtfTpbJLDr/lwfgO53E= golang.org/x/exp v0.0.0-20220909182711-5c715a9e8561/go.mod h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE= +golang.org/x/image v0.23.0 h1:HseQ7c2OpPKTPVzNjG5fwJsOTCiiwS4QdsYi5XU6H68= +golang.org/x/image v0.23.0/go.mod h1:wJJBTdLfCCf3tiHa1fNxpZmUI4mmoZvwMCPP0ddoNKY= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod 
h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.22.0 h1:9sGLhx7iRIHEiX0oAJ3MRZMUCElJgy7Br1nO+AMN3Tc= +golang.org/x/net v0.22.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= +golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= +golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= diff --git a/mcp-docs-server/src/resources/documentation.ts b/mcp-docs-server/src/resources/documentation.ts index 4472d44..67e862b 100644 --- a/mcp-docs-server/src/resources/documentation.ts +++ b/mcp-docs-server/src/resources/documentation.ts @@ -7,19 +7,19 @@ export class DocumentationSource { switch (uri) { case "vapi://docs/overview": return this.getDocumentationOverview(); - + case "vapi://docs/quickstart": return this.getQuickStartGuide(); - + case "vapi://examples/collection": return this.getExamplesCollection(); - + case "vapi://api/reference": return this.getApiReference(); - + case "vapi://changelog/latest": return this.getLatestChanges(); - + default: throw new Error(`Unknown resource URI: ${uri}`); } @@ -184,7 +184,7 @@ const assistant = await vapi.assistants.create({ }, model: { provider: "openai", - model: "gpt-3.5-turbo", + model: "gpt-4o", messages: [{ role: "system", content: "You are a helpful assistant. Be concise and friendly." @@ -209,7 +209,7 @@ assistant = vapi.assistants.create( }, model={ "provider": "openai", - "model": "gpt-3.5-turbo", + "model": "gpt-4o", "messages": [{ "role": "system", "content": "You are a helpful assistant. Be concise and friendly." @@ -332,7 +332,7 @@ const assistant = await vapi.assistants.create({ voice: { provider: "openai", voiceId: "alloy" }, model: { provider: "openai", - model: "gpt-3.5-turbo", + model: "gpt-4o", messages: [{ role: "system", content: "You are a helpful assistant." }] } }); @@ -351,7 +351,7 @@ const supportBot = await vapi.assistants.create({ voice: { provider: "openai", voiceId: "echo" }, model: { provider: "openai", - model: "gpt-4", + model: "gpt-4o", messages: [{ role: "system", content: \`You are a customer support representative for Acme Corp. @@ -394,7 +394,7 @@ const assistant = await vapi.assistants.create({ voice: { provider: "elevenlabs", voiceId: "21m00Tcm4TlvDq8ikWAM" }, model: { provider: "openai", - model: "gpt-4", + model: "gpt-4o", messages: [{ role: "system", content: "You are a smart assistant that can help with weather and scheduling." 
@@ -531,7 +531,7 @@ def create_assistant():
         },
         model={
             "provider": "openai",
-            "model": "gpt-3.5-turbo",
+            "model": "gpt-4o",
             "messages": [{
                 "role": "system",
                 "content": data['system_prompt']
diff --git a/pkg/config/config.go b/pkg/config/config.go
index a02c568..0085df7 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -63,8 +63,8 @@ var environments = map[string]Environment{
 	},
 	"staging": {
 		Name:         "staging",
-		APIBaseURL:   "https://api.staging.vapi.ai",
-		DashboardURL: "https://dashboard.staging.vapi.ai",
+		APIBaseURL:   "https://staging-api.vapi.ai",
+		DashboardURL: "https://staging-dashboard.vapi.ai",
 	},
 	"development": {
 		Name:         "development",
diff --git a/pkg/voice/api.go b/pkg/voice/api.go
new file mode 100644
index 0000000..3a606c3
--- /dev/null
+++ b/pkg/voice/api.go
@@ -0,0 +1,77 @@
+package voice
+
+import (
+	"strconv"
+	"time"
+)
+
+// APIHandler manages API request/response logging and handling
+type APIHandler struct {
+	client      *VoiceClient
+	requestLog  chan APIRequest
+	responseLog chan APIResponse
+}
+
+// NewAPIHandler creates a new API handler
+func NewAPIHandler(client *VoiceClient) *APIHandler {
+	return &APIHandler{
+		client:      client,
+		requestLog:  make(chan APIRequest, 100),
+		responseLog: make(chan APIResponse, 100),
+	}
+}
+
+// LogRequest logs an API request
+func (h *APIHandler) LogRequest(method, url string, headers map[string]string, body interface{}) {
+	req := APIRequest{
+		Method:    method,
+		URL:       url,
+		Headers:   headers,
+		Body:      body,
+		Timestamp: time.Now(),
+	}
+
+	select {
+	case h.requestLog <- req:
+	default:
+		// Channel full, skip logging
+	}
+}
+
+// LogResponse logs an API response
+func (h *APIHandler) LogResponse(statusCode int, headers map[string]string, body interface{}, duration time.Duration) {
+	resp := APIResponse{
+		StatusCode: statusCode,
+		Headers:    headers,
+		Body:       body,
+		Duration:   duration,
+		Timestamp:  time.Now(),
+	}
+
+	select {
+	case h.responseLog <- resp:
+	default:
+		// Channel full, skip logging
+	}
+}
+
+// GetRequestLog returns the request log channel
+func (h *APIHandler) GetRequestLog() <-chan APIRequest {
+	return h.requestLog
+}
+
+// GetResponseLog returns the response log channel
+func (h *APIHandler) GetResponseLog() <-chan APIResponse {
+	return h.responseLog
+}
+
+// FormatRequest formats an API request for display
+func FormatRequest(req *APIRequest) string {
+	return req.Timestamp.Format("15:04:05") + " → " + req.Method + " " + req.URL
+}
+
+// FormatResponse formats an API response for display
+func FormatResponse(resp APIResponse) string {
+	// strconv.Itoa renders the numeric status code; a rune conversion would
+	// emit an unrelated Unicode character instead of "200".
+	return resp.Timestamp.Format("15:04:05") + " ← " +
+		strconv.Itoa(resp.StatusCode) + " " +
+		resp.Duration.String()
+}
diff --git a/pkg/voice/audio.go b/pkg/voice/audio.go
new file mode 100644
index 0000000..6a1ee4e
--- /dev/null
+++ b/pkg/voice/audio.go
@@ -0,0 +1,434 @@
+package voice
+
+import (
+	"fmt"
+	"math"
+	"sync"
+	"time"
+
+	"github.com/gordonklaus/portaudio"
+)
+
+const (
+	// Audio configuration constants
+	SampleRate    = 48000
+	FrameSize     = 480 // 10ms at 48kHz
+	Channels      = 1   // Mono
+	BitsPerSample = 16
+)
+
+// AudioBuffer represents a circular buffer for audio data
+type AudioBuffer struct {
+	data  []float32
+	size  int
+	head  int
+	tail  int
+	count int
+	mutex sync.Mutex
+}
+
+// NewAudioBuffer creates a new audio buffer
+func NewAudioBuffer(size int) *AudioBuffer {
+	return &AudioBuffer{
+		data: make([]float32, size),
+		size: size,
+	}
+}
+
+// Write writes audio data to the buffer
+func (b *AudioBuffer) Write(data []float32) int {
+	b.mutex.Lock()
+	defer b.mutex.Unlock()
+
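+	// Overwrite-oldest policy: when the ring is full, advance the tail and
+	// drop the stalest sample so live capture never blocks on a slow reader.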
+ written := 0 + for i, sample := range data { + if b.count >= b.size { + // Buffer full, drop oldest sample + b.tail = (b.tail + 1) % b.size + b.count-- + } + + b.data[b.head] = sample + b.head = (b.head + 1) % b.size + b.count++ + written = i + 1 + } + + return written +} + +// Read reads audio data from the buffer +func (b *AudioBuffer) Read(data []float32) int { + b.mutex.Lock() + defer b.mutex.Unlock() + + read := 0 + for i := range data { + if b.count == 0 { + // Buffer empty, fill with silence + data[i] = 0 + } else { + data[i] = b.data[b.tail] + b.tail = (b.tail + 1) % b.size + b.count-- + } + read = i + 1 + } + + return read +} + +// Available returns the number of samples available for reading +func (b *AudioBuffer) Available() int { + b.mutex.Lock() + defer b.mutex.Unlock() + return b.count +} + +// AudioStream manages audio input and output streams +type AudioStream struct { + deviceManager *AudioDeviceManager + config *WebRTCConfig + + // Input stream + inputStream *portaudio.Stream + inputBuffer *AudioBuffer + inputDevice *AudioDevice + + // Output stream + outputStream *portaudio.Stream + outputBuffer *AudioBuffer + outputDevice *AudioDevice + + // Control + running bool + runMutex sync.RWMutex + stopChan chan struct{} + + // Debugging + debugger *AudioDebugger +} + +// NewAudioStream creates a new audio stream +func NewAudioStream(config *WebRTCConfig) (*AudioStream, error) { + deviceManager := NewAudioDeviceManager() + if err := deviceManager.Initialize(); err != nil { + return nil, fmt.Errorf("failed to initialize device manager: %w", err) + } + + // Create debugger if enabled + debugger := NewAudioDebugger(config.AudioDebug) + + // Create audio buffers (1 second of audio data) + bufferSize := SampleRate * 1 + inputBuffer := NewAudioBuffer(bufferSize) + outputBuffer := NewAudioBuffer(bufferSize) + + return &AudioStream{ + deviceManager: deviceManager, + config: config, + inputBuffer: inputBuffer, + outputBuffer: outputBuffer, + stopChan: make(chan struct{}), + debugger: debugger, + }, nil +} + +// Start starts the audio streams +func (a *AudioStream) Start() error { + a.runMutex.Lock() + defer a.runMutex.Unlock() + + if a.running { + return fmt.Errorf("audio stream already running") + } + + // Setup input device + var err error + if a.config.AudioInputDevice == "default" || a.config.AudioInputDevice == "" { + a.inputDevice, err = a.deviceManager.GetDefaultInputDevice() + } else { + a.inputDevice, err = a.deviceManager.FindInputDeviceByName(a.config.AudioInputDevice) + } + if err != nil { + return fmt.Errorf("failed to get input device: %w", err) + } + + // Setup output device + if a.config.AudioOutputDevice == "default" || a.config.AudioOutputDevice == "" { + a.outputDevice, err = a.deviceManager.GetDefaultOutputDevice() + } else { + a.outputDevice, err = a.deviceManager.FindOutputDeviceByName(a.config.AudioOutputDevice) + } + if err != nil { + return fmt.Errorf("failed to get output device: %w", err) + } + + // Start input stream + if err := a.startInputStream(); err != nil { + return fmt.Errorf("failed to start input stream: %w", err) + } + + // Start output stream + if err := a.startOutputStream(); err != nil { + if closeErr := a.inputStream.Close(); closeErr != nil { + fmt.Printf("Failed to close input stream: %v\n", closeErr) + } + return fmt.Errorf("failed to start output stream: %w", err) + } + + // Start debugger if enabled + if err := a.debugger.Start(); err != nil { + fmt.Printf("Failed to start audio debugger: %v\n", err) + } + + a.running = true + return nil 
+} + +// createStream is a helper function to create audio streams +func (a *AudioStream) createStream(isInput bool, device *AudioDevice, callback interface{}) (*portaudio.Stream, error) { + // Get all devices to find the actual device info + devices, err := portaudio.Devices() + if err != nil { + return nil, fmt.Errorf("failed to get devices: %w", err) + } + + if device.Index >= len(devices) { + return nil, fmt.Errorf("invalid device index: %d", device.Index) + } + + actualDevice := devices[device.Index] + + var params portaudio.StreamParameters + if isInput { + params = portaudio.StreamParameters{ + Input: portaudio.StreamDeviceParameters{ + Device: actualDevice, + Channels: Channels, + Latency: time.Duration(device.DefaultLowInputLatency * float64(time.Second)), + }, + SampleRate: SampleRate, + FramesPerBuffer: FrameSize, + } + } else { + params = portaudio.StreamParameters{ + Output: portaudio.StreamDeviceParameters{ + Device: actualDevice, + Channels: Channels, + Latency: time.Duration(device.DefaultLowOutputLatency * float64(time.Second)), + }, + SampleRate: SampleRate, + FramesPerBuffer: FrameSize, + } + } + + stream, err := portaudio.OpenStream(params, callback) + if err != nil { + return nil, fmt.Errorf("failed to open stream: %w", err) + } + + if err := stream.Start(); err != nil { + if closeErr := stream.Close(); closeErr != nil { + fmt.Printf("Failed to close stream: %v\n", closeErr) + } + return nil, fmt.Errorf("failed to start stream: %w", err) + } + + return stream, nil +} + +// startInputStream starts the audio input stream +func (a *AudioStream) startInputStream() error { + // Create input callback + inputCallback := func(in []float32) { + // Debug input audio + a.debugger.WriteInput(in) + a.debugger.LogAudioStats(in, "Input") + + // Write audio data to input buffer for processing + a.inputBuffer.Write(in) + } + + stream, err := a.createStream(true, a.inputDevice, inputCallback) + if err != nil { + return fmt.Errorf("failed to create input stream: %w", err) + } + + a.inputStream = stream + return nil +} + +// startOutputStream starts the audio output stream +func (a *AudioStream) startOutputStream() error { + // Create output callback + outputCallback := func(out []float32) { + // Read audio data from output buffer + a.outputBuffer.Read(out) + + // Debug output audio + a.debugger.WriteOutput(out) + a.debugger.LogAudioStats(out, "Output") + } + + stream, err := a.createStream(false, a.outputDevice, outputCallback) + if err != nil { + return fmt.Errorf("failed to create output stream: %w", err) + } + + a.outputStream = stream + return nil +} + +// Stop stops the audio streams +func (a *AudioStream) Stop() error { + a.runMutex.Lock() + defer a.runMutex.Unlock() + + if !a.running { + return nil + } + + // Signal stop + close(a.stopChan) + + // Stop and close streams + var inputErr, outputErr error + + if a.inputStream != nil { + inputErr = a.inputStream.Close() + a.inputStream = nil + } + + if a.outputStream != nil { + outputErr = a.outputStream.Close() + a.outputStream = nil + } + + // Stop debugger + if err := a.debugger.Stop(); err != nil { + fmt.Printf("Warning: failed to stop audio debugger: %v\n", err) + } + + // Terminate device manager + if err := a.deviceManager.Terminate(); err != nil { + fmt.Printf("Warning: failed to terminate device manager: %v\n", err) + } + + a.running = false + + // Return first error encountered + if inputErr != nil { + return fmt.Errorf("failed to close input stream: %w", inputErr) + } + if outputErr != nil { + return fmt.Errorf("failed to 
close output stream: %w", outputErr)
+	}
+
+	return nil
+}
+
+// WriteAudio writes audio data to the output buffer (for incoming audio)
+func (a *AudioStream) WriteAudio(data []float32) int {
+	return a.outputBuffer.Write(data)
+}
+
+// GetInputLevel returns the current input audio level (0.0 to 1.0)
+func (a *AudioStream) GetInputLevel() float32 {
+	// Get recent audio data from input buffer
+	samples := make([]float32, FrameSize)
+	read := a.inputBuffer.Read(samples)
+
+	if read == 0 {
+		return 0.0
+	}
+
+	// Sum the squared samples
+	var sum float32
+	for i := 0; i < read; i++ {
+		sum += samples[i] * samples[i]
+	}
+
+	// Mean-square power of the frame: a cheap level estimate that already
+	// lies in [0, 1] for samples in [-1, 1]. (A true RMS would take the
+	// square root, which would pull in the math package.)
+	level := sum / float32(read)
+
+	// Clamp to [0, 1]
+	if level > 1.0 {
+		level = 1.0
+	}
+
+	return level
+}
+
+// GetOutputLevel returns the current output audio level (0.0 to 1.0)
+func (a *AudioStream) GetOutputLevel() float32 {
+	// For output level, we can check the buffer fill level as a proxy
+	available := a.outputBuffer.Available()
+	bufferSize := a.outputBuffer.size
+
+	if bufferSize == 0 {
+		return 0.0
+	}
+
+	level := float32(available) / float32(bufferSize)
+	if level > 1.0 {
+		level = 1.0
+	}
+
+	return level
+}
+
+// IsRunning returns true if the audio stream is running
+func (a *AudioStream) IsRunning() bool {
+	a.runMutex.RLock()
+	defer a.runMutex.RUnlock()
+	return a.running
+}
+
+// GetBufferState returns detailed buffer state for debugging
+func (a *AudioStream) GetBufferState() (inputAvail, outputAvail, inputSize, outputSize int) {
+	if a.inputBuffer != nil {
+		inputAvail = a.inputBuffer.Available()
+		inputSize = a.inputBuffer.size
+	}
+	if a.outputBuffer != nil {
+		outputAvail = a.outputBuffer.Available()
+		outputSize = a.outputBuffer.size
+	}
+	return
+}
+
+// LogBufferState logs current buffer state using the debugger
+func (a *AudioStream) LogBufferState() {
+	if a.debugger == nil {
+		return
+	}
+
+	inputAvail, outputAvail, inputSize, outputSize := a.GetBufferState()
+	a.debugger.LogBufferState(inputAvail, outputAvail, inputSize, outputSize)
+}
+
+// GetInputDevice returns the current input device
+func (a *AudioStream) GetInputDevice() *AudioDevice {
+	return a.inputDevice
+}
+
+// ReadAudio reads audio samples from the input buffer
+func (a *AudioStream) ReadAudio(numSamples int) []float32 {
+	if !a.IsRunning() {
+		return make([]float32, numSamples) // Return silence if not running
+	}
+
+	samples := make([]float32, numSamples)
+	a.inputBuffer.Read(samples)
+	return samples
+}
+
+// GetOutputDevice returns the current output device
+func (a *AudioStream) GetOutputDevice() *AudioDevice {
+	return a.outputDevice
+}
diff --git a/pkg/voice/client.go b/pkg/voice/client.go
new file mode 100644
index 0000000..afc5f44
--- /dev/null
+++ b/pkg/voice/client.go
@@ -0,0 +1,755 @@
+package voice
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+
+	vapiclient "github.com/VapiAI/server-sdk-go/client"
+)
+
+// CallStatus represents the current state of a voice call
+type CallStatus string
+
+const (
+	CallStatusIdle         CallStatus = "idle"
+	CallStatusConnecting   CallStatus = "connecting"
+	CallStatusConnected    CallStatus = "connected"
+	CallStatusDisconnected CallStatus = "disconnected"
+	CallStatusFailed       CallStatus = "failed"
+)
+
+// CallState holds the current state of a voice call
+type CallState struct {
+	CallID       string
+	AssistantID  string
+	Status       CallStatus
+	StartTime    time.Time
+	WebSocketURL string
+}
+
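+// The request and response logs below are bounded channels meant to be
+// drained by a UI; their writers use select with a default case so that
+// logging can never block the audio path (entries are dropped when a
+// channel is full).
+
+// APIRequest 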
represents a request to the Vapi API +type APIRequest struct { + Method string + URL string + Headers map[string]string + Body interface{} + Timestamp time.Time +} + +// APIResponse represents a response from the Vapi API +type APIResponse struct { + StatusCode int + Headers map[string]string + Body interface{} + Duration time.Duration + Timestamp time.Time +} + +// VoiceClient manages voice calls with Vapi WebSocket transport +type VoiceClient struct { + config *WebRTCConfig + vapiClient *vapiclient.Client + callState *CallState + + // Audio pipeline + audioStream *AudioStream + + // WebSocket signaling + signaling *VapiWebSocket + + // Audio processing with jitter buffer + audioProcessor *WebSocketAudioProcessor + jitterBuffer *WebSocketJitterBuffer + + // Echo cancellation state + lastSpeakerSamples []float32 + + // Silence detection + silenceThreshold float32 + consecutiveSilentChunks int + maxSilentChunks int + + // Event channels + requestLog chan APIRequest + responseLog chan APIResponse + callEvents chan CallEvent +} + +// CallEvent represents events during a voice call +type CallEvent struct { + Type string + Data interface{} + Timestamp time.Time +} + +// NewVoiceClient creates a new voice client +func NewVoiceClient(config *WebRTCConfig, vapiClient *vapiclient.Client) (*VoiceClient, error) { + if config == nil { + config = DefaultWebRTCConfig() + } + + // Create audio stream + audioStream, err := NewAudioStream(config) + if err != nil { + return nil, fmt.Errorf("failed to create audio stream: %w", err) + } + + // Create WebSocket signaling client + signaling := NewVapiWebSocket() + + // Create audio processor + audioProcessor, err := NewWebSocketAudioProcessor() + if err != nil { + return nil, fmt.Errorf("failed to create audio processor: %w", err) + } + + // Create WebSocket jitter buffer for incoming audio + jitterBuffer, err := NewWebSocketJitterBuffer(DefaultWebSocketJitterConfig()) + if err != nil { + return nil, fmt.Errorf("failed to create jitter buffer: %w", err) + } + + return &VoiceClient{ + config: config, + vapiClient: vapiClient, + audioStream: audioStream, + signaling: signaling, + audioProcessor: audioProcessor, + jitterBuffer: jitterBuffer, + lastSpeakerSamples: make([]float32, 0), + silenceThreshold: 0.001, // -60dB threshold for silence detection + maxSilentChunks: 3, // Allow max 3 consecutive silent chunks before gating + callState: &CallState{ + Status: CallStatusIdle, + }, + requestLog: make(chan APIRequest, 100), + responseLog: make(chan APIResponse, 100), + callEvents: make(chan CallEvent, 100), + }, nil +} + +// StartCall initiates a voice call with the specified assistant +func (c *VoiceClient) StartCall(assistantID string) error { + c.callState.Status = CallStatusConnecting + c.callState.AssistantID = assistantID + c.callState.StartTime = time.Now() + + // 1. Create WebSocket call via Vapi's /call endpoint with WebSocket transport + call, err := c.createVapiWebSocketCall(assistantID) + if err != nil { + c.callState.Status = CallStatusFailed + return fmt.Errorf("failed to create Vapi WebSocket call: %w", err) + } + + // Update call state from Vapi response + c.callState.CallID = call.Id + c.callState.WebSocketURL = call.RoomURL + + // 2. Connect to Vapi WebSocket transport + if err := c.signaling.Connect(call.RoomURL); err != nil { + c.callState.Status = CallStatusFailed + return fmt.Errorf("failed to connect to WebSocket transport: %w", err) + } + + // Start monitoring signaling events + go c.handleSignalingEvents() + + // 3. 
Start audio stream + if err := c.audioStream.Start(); err != nil { + c.callState.Status = CallStatusFailed + return fmt.Errorf("failed to start audio stream: %w", err) + } + + // 4. Reset and start audio processing + c.audioProcessor.Reset() + c.consecutiveSilentChunks = 0 // Reset silence detection + + // 5. Start jitter buffer for incoming audio + if err := c.jitterBuffer.Start(); err != nil { + c.callState.Status = CallStatusFailed + return fmt.Errorf("failed to start jitter buffer: %w", err) + } + + // 6. Start streaming microphone audio to WebSocket + go c.streamMicrophoneAudio() + + // 7. Start jitter buffer audio processing + go c.processJitterBufferAudio() + + c.callState.Status = CallStatusConnected + + // Emit call started event + c.callEvents <- CallEvent{ + Type: "call_started", + Data: c.callState, + Timestamp: time.Now(), + } + + return nil +} + +// WebSocketCallRequest represents the request structure for /call endpoint with WebSocket transport +type WebSocketCallRequest struct { + AssistantID string `json:"assistantId"` + Transport struct { + Provider string `json:"provider"` + AudioFormat struct { + Format string `json:"format"` + Container string `json:"container"` + SampleRate int `json:"sampleRate"` + } `json:"audioFormat"` + } `json:"transport"` +} + +// WebSocketCallResponse represents the response from /call endpoint with WebSocket transport +type WebSocketCallResponse struct { + ID string `json:"id"` + Status string `json:"status"` + AssistantID string `json:"assistantId"` + Transport struct { + Provider string `json:"provider"` + WebsocketCallURL string `json:"websocketCallUrl"` // The WebSocket URL for audio transport + } `json:"transport"` + CreatedAt time.Time `json:"createdAt"` +} + +// Call represents a Vapi call for WebSocket transport +type Call struct { + Id string + AssistantID string + Status string + RoomURL string + RoomName string + JoinToken string + ListenURL string // Vapi WebSocket for monitoring + ControlURL string // Vapi control URL +} + +// createVapiWebSocketCall creates a WebSocket call via Vapi's /call endpoint with WebSocket transport +func (c *VoiceClient) createVapiWebSocketCall(assistantID string) (*Call, error) { + // Prepare the request payload for WebSocket transport + payload := WebSocketCallRequest{ + AssistantID: assistantID, + Transport: struct { + Provider string `json:"provider"` + AudioFormat struct { + Format string `json:"format"` + Container string `json:"container"` + SampleRate int `json:"sampleRate"` + } `json:"audioFormat"` + }{ + Provider: "vapi.websocket", + AudioFormat: struct { + Format string `json:"format"` + Container string `json:"container"` + SampleRate int `json:"sampleRate"` + }{ + Format: "pcm_s16le", + Container: "raw", + SampleRate: 16000, // Request 16kHz from Vapi (their default) + }, + }, + } + + // Marshal the request payload + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("failed to marshal WebSocket call request: %w", err) + } + + // Get the API base URL from config + baseURL := c.config.getAPIBaseURL() + url := baseURL + "/call" + + // Use private API key for call creation + privateKey := c.config.getPrivateAPIKey() + + // Create HTTP request + req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonPayload)) + if err != nil { + return nil, fmt.Errorf("failed to create WebSocket call request: %w", err) + } + + // Set headers + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+privateKey) + + // Log the API 
request, masking the key (guarded so a short key can't panic the slice)
+	maskedKey := privateKey
+	if len(maskedKey) > 10 {
+		maskedKey = maskedKey[:10]
+	}
+	requestLog := APIRequest{
+		Method:    "POST",
+		URL:       url,
+		Headers:   map[string]string{"Authorization": "Bearer " + maskedKey + "...", "Content-Type": "application/json"},
+		Body:      payload,
+		Timestamp: time.Now(),
+	}
+	select {
+	case c.requestLog <- requestLog:
+	default:
+		// Channel full, drop log
+	}
+
+	// Make the HTTP request
+	startTime := time.Now()
+	client := &http.Client{Timeout: 30 * time.Second}
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create WebSocket call: %w", err)
+	}
+	defer resp.Body.Close() //nolint:errcheck // Error handling would complicate deferred cleanup
+
+	// Check response status
+	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated {
+		// Try to read error response body for more details
+		var errorBody map[string]interface{}
+		if err := json.NewDecoder(resp.Body).Decode(&errorBody); err == nil {
+			// Log error response
+			responseLog := APIResponse{
+				StatusCode: resp.StatusCode,
+				Headers:    make(map[string]string),
+				Body:       errorBody,
+				Duration:   time.Since(startTime),
+				Timestamp:  time.Now(),
+			}
+			select {
+			case c.responseLog <- responseLog:
+			default:
+				// Channel full, drop log
+			}
+			return nil, fmt.Errorf("WebSocket call creation failed with status %d: %v", resp.StatusCode, errorBody)
+		}
+		return nil, fmt.Errorf("WebSocket call creation failed with status: %d", resp.StatusCode)
+	}
+
+	// Read raw response to see the actual structure
+	// (the deferred Close above releases the body; no second Close is needed)
+	bodyBytes, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response body: %w", err)
+	}
+
+	// Log successful response
+	var responseBody map[string]interface{}
+	if err := json.Unmarshal(bodyBytes, &responseBody); err != nil {
+		fmt.Printf("Failed to unmarshal response body: %v\n", err)
+	}
+
+	responseLog := APIResponse{
+		StatusCode: resp.StatusCode,
+		Headers:    make(map[string]string),
+		Body:       responseBody,
+		Duration:   time.Since(startTime),
+		Timestamp:  time.Now(),
+	}
+	select {
+	case c.responseLog <- responseLog:
+	default:
+		// Channel full, drop log
+	}
+
+	// Parse the response
+	var wsCallResp WebSocketCallResponse
+	if err := json.Unmarshal(bodyBytes, &wsCallResp); err != nil {
+		return nil, fmt.Errorf("failed to decode WebSocket call response: %w", err)
+	}
+
+	// Convert to our internal Call structure
+	call := &Call{
+		Id:          wsCallResp.ID,
+		AssistantID: wsCallResp.AssistantID,
+		Status:      wsCallResp.Status,
+		RoomURL:     wsCallResp.Transport.WebsocketCallURL, // Use WebSocket URL as room URL
+		RoomName:    wsCallResp.ID,                         // Use call ID as room name
+		JoinToken:   "",                                    // No token needed for WebSocket transport
+		ListenURL:   wsCallResp.Transport.WebsocketCallURL, // WebSocket URL for transport
+		ControlURL:  "",                                    // No separate control URL for WebSocket transport
+	}
+
+	return call, nil
+}
+
+// endVapiCall sends a DELETE request to Vapi to properly end the call
+func (c *VoiceClient) endVapiCall(callID string) error {
+	// Get the API base URL from config
+	baseURL := c.config.getAPIBaseURL()
+	url := baseURL + "/call/" + callID
+
+	// Use private API key for call termination
+	privateKey := c.config.getPrivateAPIKey()
+
+	// Create DELETE request
+	req, err := http.NewRequest("DELETE", url, http.NoBody)
+	if err != nil {
+		return fmt.Errorf("failed to create end call request: %w", err)
+	}
+
+	// Set headers
+	req.Header.Set("Authorization", "Bearer "+privateKey)
+
+	// Log the API request, masking the key as above
+	maskedKey := privateKey
+	if len(maskedKey) > 10 {
+		maskedKey = maskedKey[:10]
+	}
+	requestLog := APIRequest{
+		Method:    "DELETE",
+		URL:       url,
+		Headers:   map[string]string{"Authorization": "Bearer " + maskedKey + "..."},
+		Body:      nil,
+		Timestamp: time.Now(),
+	}
+	select {
+	case c.requestLog <- requestLog:
+	default:
+		// Channel full, drop log
+	}
+
+	// Make the HTTP request
+	startTime := time.Now()
+	client := &http.Client{Timeout: 10 * time.Second}
+	resp, err := client.Do(req)
+	if err != nil {
+		return fmt.Errorf("failed to send end call request: %w", err)
+	}
+	defer resp.Body.Close() //nolint:errcheck // Error handling would complicate deferred cleanup
+
+	// Log response
+	responseLog := APIResponse{
+		StatusCode: resp.StatusCode,
+		Headers:    make(map[string]string),
+		Body:       nil,
+		Duration:   time.Since(startTime),
+		Timestamp:  time.Now(),
+	}
+	select {
+	case c.responseLog <- responseLog:
+	default:
+		// Channel full, drop log
+	}
+
+	// Check response status
+	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNoContent {
+		return fmt.Errorf("end call request failed with status: %d", resp.StatusCode)
+	}
+
+	return nil
+}
+
+// EndCall terminates the current voice call
+func (c *VoiceClient) EndCall() error {
+	if c.callState.Status == CallStatusIdle {
+		return fmt.Errorf("no active call to end")
+	}
+
+	// Send DELETE request to Vapi to properly end the call
+	if c.callState.CallID != "" {
+		if err := c.endVapiCall(c.callState.CallID); err != nil {
+			fmt.Printf("Warning: failed to end Vapi call: %v\n", err)
+			// Continue with local cleanup even if API call fails
+		}
+	}
+
+	// Stop jitter buffer
+	if c.jitterBuffer != nil {
+		if err := c.jitterBuffer.Stop(); err != nil {
+			fmt.Printf("Warning: failed to stop jitter buffer: %v\n", err)
+		}
+	}
+
+	// Stop audio stream
+	if c.audioStream != nil {
+		if err := c.audioStream.Stop(); err != nil {
+			fmt.Printf("Warning: failed to stop audio stream: %v\n", err)
+		}
+	}
+
+	// Close signaling connection
+	if c.signaling != nil {
+		if err := c.signaling.Close(); err != nil {
+			fmt.Printf("Warning: failed to close signaling: %v\n", err)
+		}
+	}
+
+	// Reset call state
+	c.callState.Status = CallStatusIdle
+	c.callState.WebSocketURL = ""
+
+	// Emit call ended event
+	c.callEvents <- CallEvent{
+		Type:      "call_ended",
+		Data:      c.callState,
+		Timestamp: time.Now(),
+	}
+
+	return nil
+}
+
+// GetCallState returns the current call state
+func (c *VoiceClient) GetCallState() *CallState {
+	return c.callState
+}
+
+// GetRequestLog returns the API request log channel
+func (c *VoiceClient) GetRequestLog() <-chan APIRequest {
+	return c.requestLog
+}
+
+// GetResponseLog returns the API response log channel
+func (c *VoiceClient) GetResponseLog() <-chan APIResponse {
+	return c.responseLog
+}
+
+// GetCallEvents returns the call events channel
+func (c *VoiceClient) GetCallEvents() <-chan CallEvent {
+	return c.callEvents
+}
+
+// GetAudioLevels returns current input and output audio levels
+func (c *VoiceClient) GetAudioLevels() (input, output float32) {
+	if c.audioStream == nil {
+		return 0.0, 0.0
+	}
+
+	return c.audioStream.GetInputLevel(), c.audioStream.GetOutputLevel()
+}
+
+// IsAudioRunning returns true if audio stream is active
+func (c *VoiceClient) IsAudioRunning() bool {
+	if c.audioStream == nil {
+		return false
+	}
+
+	return c.audioStream.IsRunning()
+}
+
+// ResetAudioProcessor resets the audio processor's internal state
+func (c *VoiceClient) ResetAudioProcessor() {
+	if c.audioProcessor != nil {
+		c.audioProcessor.Reset()
+	}
+}
+
+// SetNoiseGateThreshold adjusts the noise gate 
sensitivity +func (c *VoiceClient) SetNoiseGateThreshold(threshold float32) { + if c.audioProcessor != nil { + c.audioProcessor.SetNoiseGateThreshold(threshold) + } +} + +// SetEchoLearningRate adjusts the echo cancellation learning rate +func (c *VoiceClient) SetEchoLearningRate(rate float32) { + if c.audioProcessor != nil { + c.audioProcessor.SetLearningRate(rate) + } +} + +// SetSilenceThreshold adjusts the silence detection threshold +func (c *VoiceClient) SetSilenceThreshold(threshold float32) { + c.silenceThreshold = threshold +} + +// SetMaxSilentChunks adjusts how many consecutive silent chunks to allow before gating +func (c *VoiceClient) SetMaxSilentChunks(maxChunks int) { + c.maxSilentChunks = maxChunks +} + +// GetSilenceStats returns current silence detection statistics +func (c *VoiceClient) GetSilenceStats() (threshold float32, maxChunks, consecutive int) { + return c.silenceThreshold, c.maxSilentChunks, c.consecutiveSilentChunks +} + +// GetJitterBufferStats returns current jitter buffer performance statistics +func (c *VoiceClient) GetJitterBufferStats() map[string]interface{} { + if c.jitterBuffer == nil { + return map[string]interface{}{"error": "jitter buffer not initialized"} + } + return c.jitterBuffer.GetStats() +} + +// handleSignalingEvents processes events from Vapi WebSocket signaling +func (c *VoiceClient) handleSignalingEvents() { + for event := range c.signaling.GetEvents() { + // Skip noisy audio_data events from being logged + if event.Type == "audio_data" { + // Handle audio data directly without forwarding as call event + if samples, ok := event.Data.([]float32); ok { + + // Store speaker samples for echo cancellation + c.lastSpeakerSamples = samples + + // Send samples to jitter buffer for adaptive buffering + if err := c.jitterBuffer.WriteAudio(samples); err != nil { + fmt.Printf("āš ļø Jitter buffer write failed: %v\n", err) + } + } + continue + } + + // Skip excessive logging events + if event.Type == "model-output" || event.Type == "voice-input" { + continue + } + + // Forward other signaling events as call events (for logging) + callEvent := CallEvent{ + Type: "signaling_" + event.Type, + Data: event.Data, + Timestamp: event.Timestamp, + } + + select { + case c.callEvents <- callEvent: + default: + // Channel full, drop event + } + + // Handle specific signaling events + switch event.Type { + case "room_joined": + c.callEvents <- CallEvent{ + Type: "room_connected", + Data: "Successfully connected to Vapi WebSocket transport", + Timestamp: time.Now(), + } + + case "participant_joined": + c.callEvents <- CallEvent{ + Type: "participant_joined", + Data: event.Data, + Timestamp: time.Now(), + } + + case "speech-update": + // Handle speech status updates + c.callEvents <- CallEvent{ + Type: "speech_update", + Data: event.Data, + Timestamp: time.Now(), + } + + case "transcript": + // Handle transcript events + c.callEvents <- CallEvent{ + Type: "transcript", + Data: event.Data, + Timestamp: time.Now(), + } + + case "webrtc_error", "daily_error", "websocket_error": + c.callEvents <- CallEvent{ + Type: "connection_error", + Data: event.Data, + Timestamp: time.Now(), + } + } + } +} + +// streamMicrophoneAudio continuously streams audio from microphone to Vapi WebSocket +func (c *VoiceClient) streamMicrophoneAudio() { + // Buffer for audio samples + // AudioStream uses 48kHz, but Vapi expects 16kHz + const audioStreamSampleRate = 48000 + const vapiSampleRate = 16000 + const chunkDurationMs = 20 + const audioStreamSamplesPerChunk = (audioStreamSampleRate * 
chunkDurationMs) / 1000 // 960 samples at 48kHz + const vapiSamplesPerChunk = (vapiSampleRate * chunkDurationMs) / 1000 // 320 samples at 16kHz + + audioBuffer := make([]float32, vapiSamplesPerChunk) + chunkCount := 0 + + for c.callState.Status == CallStatusConnected || c.callState.Status == CallStatusConnecting { + chunkCount++ + // Read audio from microphone + if c.audioStream.IsRunning() { + // Get audio samples from input stream at 48kHz + inputSamples := c.audioStream.ReadAudio(audioStreamSamplesPerChunk) + if len(inputSamples) > 0 { + // Downsample from 48kHz to 16kHz (take every 3rd sample) + for i := 0; i < vapiSamplesPerChunk && i*3 < len(inputSamples); i++ { + audioBuffer[i] = inputSamples[i*3] + } + + // Apply audio processing (echo cancellation and noise reduction) + processedAudio := c.audioProcessor.ProcessAudio(audioBuffer, c.lastSpeakerSamples) + + // Send processed audio to Vapi WebSocket + if c.signaling != nil && c.signaling.IsConnected() { + if err := c.signaling.SendAudioData(processedAudio); err != nil { + fmt.Printf("Failed to send audio data: %v\n", err) + } + } + } + } + + // Sleep for chunk duration (20ms) + time.Sleep(time.Duration(chunkDurationMs) * time.Millisecond) + } +} + +// processJitterBufferAudio continuously reads from jitter buffer and writes to audio stream +func (c *VoiceClient) processJitterBufferAudio() { + const chunkDurationMs = 20 + const vapiSampleRate = 16000 + const vapiSamplesPerChunk = (vapiSampleRate * chunkDurationMs) / 1000 // 320 samples at 16kHz + + ticker := time.NewTicker(time.Duration(chunkDurationMs) * time.Millisecond) + defer ticker.Stop() + + chunkCount := 0 + + for c.callState.Status == CallStatusConnected || c.callState.Status == CallStatusConnecting { + select { + case <-ticker.C: + chunkCount++ + + // Read processed audio from jitter buffer (16kHz) + jitterSamples := c.jitterBuffer.ReadAudio(vapiSamplesPerChunk) + + if len(jitterSamples) > 0 { + // Upsample from 16kHz to 48kHz using proper interpolation + upsampled := c.upsample16to48kHz(jitterSamples) + + // Write to audio stream + written := c.audioStream.WriteAudio(upsampled) + if written != len(upsampled) { + fmt.Printf("āš ļø Audio buffer overflow: Tried to write %d samples, only wrote %d\n", + len(upsampled), written) + } + } + + case <-time.After(100 * time.Millisecond): + // Timeout protection - continue if call is still active + if c.callState.Status != CallStatusConnected && c.callState.Status != CallStatusConnecting { + return + } + } + } +} + +// upsample16to48kHz performs proper interpolation from 16kHz to 48kHz +func (c *VoiceClient) upsample16to48kHz(samples []float32) []float32 { + // 3x upsampling with linear interpolation (better than simple repetition) + upsampled := make([]float32, len(samples)*3) + + for i := 0; i < len(samples); i++ { + // Current sample + current := samples[i] + + // Next sample (or repeat last if at end) + var next float32 + if i+1 < len(samples) { + next = samples[i+1] + } else { + next = current + } + + // Linear interpolation + upsampled[i*3] = current + upsampled[i*3+1] = current + (next-current)*0.33 + upsampled[i*3+2] = current + (next-current)*0.67 + } + + return upsampled +} diff --git a/pkg/voice/config.go b/pkg/voice/config.go new file mode 100644 index 0000000..cc64576 --- /dev/null +++ b/pkg/voice/config.go @@ -0,0 +1,69 @@ +package voice + +import ( + "time" +) + +// WebRTCConfig holds configuration for WebRTC functionality +type WebRTCConfig struct { + // Vapi API Configuration + VapiAPIKey string 
`mapstructure:"vapi_api_key"` // Private API key + VapiPublicAPIKey string `mapstructure:"vapi_public_api_key"` // Public API key for /call/web + VapiBaseURL string `mapstructure:"vapi_base_url"` + + // Daily.co Configuration (legacy - now handled by Vapi) + DailyAPIKey string `mapstructure:"daily_api_key"` + DailyDomain string `mapstructure:"daily_domain"` + + // WebRTC Configuration + STUNServers []string `mapstructure:"stun_servers"` + TURNServers []string `mapstructure:"turn_servers"` + + // Audio Configuration + AudioInputDevice string `mapstructure:"audio_input_device"` + AudioOutputDevice string `mapstructure:"audio_output_device"` + SampleRate int `mapstructure:"sample_rate"` + BufferSize int `mapstructure:"buffer_size"` + + // Call Configuration + CallTimeout time.Duration `mapstructure:"call_timeout"` + VideoEnabled bool `mapstructure:"video_enabled"` + + // Debug Configuration + AudioDebug bool `mapstructure:"audio_debug"` +} + +// DefaultWebRTCConfig returns default WebRTC configuration +func DefaultWebRTCConfig() *WebRTCConfig { + return &WebRTCConfig{ + // Default Vapi API configuration + VapiBaseURL: "https://api.vapi.ai", + + // Default to Vapi's Daily.co subdomain for WebRTC calls (legacy) + DailyDomain: "vapi", + STUNServers: []string{ + "stun:stun.l.google.com:19302", + "stun:stun1.l.google.com:19302", + }, + AudioInputDevice: "default", + AudioOutputDevice: "default", + SampleRate: 48000, + BufferSize: 480, + CallTimeout: 30 * time.Minute, + VideoEnabled: false, // Audio-only by default + AudioDebug: false, + } +} + +// getPrivateAPIKey returns the Vapi private API key for /call endpoint +func (c *WebRTCConfig) getPrivateAPIKey() string { + return c.VapiAPIKey +} + +// getAPIBaseURL returns the Vapi API base URL +func (c *WebRTCConfig) getAPIBaseURL() string { + if c.VapiBaseURL == "" { + return "https://api.vapi.ai" + } + return c.VapiBaseURL +} diff --git a/pkg/voice/debug.go b/pkg/voice/debug.go new file mode 100644 index 0000000..651a600 --- /dev/null +++ b/pkg/voice/debug.go @@ -0,0 +1,403 @@ +package voice + +import ( + "encoding/binary" + "fmt" + "os" + "sync" + "time" +) + +// AudioDebugger handles audio debugging and recording +type AudioDebugger struct { + enabled bool + inputFile *os.File + outputFile *os.File + inputMutex sync.Mutex + outputMutex sync.Mutex + sampleRate int + channels int + bitsPerSample int + + // Timing and flow tracking + lastInputTime time.Time + lastOutputTime time.Time + outputSampleCount int64 + silentChunks int + totalChunks int +} + +// NewAudioDebugger creates a new audio debugger +func NewAudioDebugger(enabled bool) *AudioDebugger { + return &AudioDebugger{ + enabled: enabled, + sampleRate: 48000, // Match your audio pipeline + channels: 1, // Mono + bitsPerSample: 16, + } +} + +// Start initializes debug recording files +func (d *AudioDebugger) Start() error { + if !d.enabled { + return nil + } + + timestamp := time.Now().Format("20060102-150405") + + // Create input debug file + inputPath := fmt.Sprintf("audio_debug_input_%s.wav", timestamp) + // #nosec G304 -- This is intentional file creation for debugging + inputFile, err := os.Create(inputPath) + if err != nil { + return fmt.Errorf("failed to create input debug file: %w", err) + } + d.inputFile = inputFile + + // Create output debug file + outputPath := fmt.Sprintf("audio_debug_output_%s.wav", timestamp) + // #nosec G304 -- This is intentional file creation for debugging + outputFile, err := os.Create(outputPath) + if err != nil { + if err := inputFile.Close(); err != nil { 
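+			// Best-effort cleanup: the outer Create error below is the one
+			// that gets reported; this close failure is only logged.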
+ fmt.Printf("Failed to close input file: %v\n", err) + } + return fmt.Errorf("failed to create output debug file: %w", err) + } + d.outputFile = outputFile + + // Write WAV headers (we'll update the size later) + if err := d.writeWAVHeader(d.inputFile); err != nil { + return fmt.Errorf("failed to write input WAV header: %w", err) + } + if err := d.writeWAVHeader(d.outputFile); err != nil { + return fmt.Errorf("failed to write output WAV header: %w", err) + } + + fmt.Printf("šŸ“ Audio debugging enabled:\n") + fmt.Printf(" Input: %s\n", inputPath) + fmt.Printf(" Output: %s\n", outputPath) + + return nil +} + +// WriteInput writes input audio samples to debug file +func (d *AudioDebugger) WriteInput(samples []float32) { + if !d.enabled || d.inputFile == nil { + return + } + + d.inputMutex.Lock() + defer d.inputMutex.Unlock() + + // Convert float32 to int16 and write + for _, sample := range samples { + // Check for clipping in float domain + if sample > 1.0 || sample < -1.0 { + fmt.Printf("āš ļø Input clipping detected: %.3f\n", sample) + } + + // Clamp to prevent overflow + if sample > 1.0 { + sample = 1.0 + } else if sample < -1.0 { + sample = -1.0 + } + + // Convert to int16 + int16Sample := int16(sample * 32767.0) + if err := binary.Write(d.inputFile, binary.LittleEndian, int16Sample); err != nil { + fmt.Printf("Failed to write input sample: %v\n", err) + } + } +} + +// WriteOutput writes output audio samples to debug file +func (d *AudioDebugger) WriteOutput(samples []float32) { + if !d.enabled || d.outputFile == nil { + return + } + + d.outputMutex.Lock() + defer d.outputMutex.Unlock() + + // Track timing and detect gaps + now := time.Now() + if !d.lastOutputTime.IsZero() { + timeSinceLastOutput := now.Sub(d.lastOutputTime) + expectedInterval := time.Duration(float64(len(samples)) / float64(d.sampleRate) * float64(time.Second)) + + // Detect significant gaps (more than 2x expected interval) + if timeSinceLastOutput > expectedInterval*2 { + fmt.Printf("šŸ”‡ OUTPUT GAP DETECTED: Expected %.2fms, got %.2fms (gap: %.2fms)\n", + float64(expectedInterval.Nanoseconds())/1e6, + float64(timeSinceLastOutput.Nanoseconds())/1e6, + float64((timeSinceLastOutput-expectedInterval).Nanoseconds())/1e6) + } + } + d.lastOutputTime = now + d.outputSampleCount += int64(len(samples)) + + // Check if this chunk is mostly silent + var silentSamples int + for _, sample := range samples { + if sample > -0.001 && sample < 0.001 { // Very quiet threshold + silentSamples++ + } + } + + d.totalChunks++ + if float64(silentSamples)/float64(len(samples)) > 0.95 { + d.silentChunks++ + if d.totalChunks%50 == 0 { // Log every 50 chunks + fmt.Printf("šŸ”‡ Output silence rate: %d/%d chunks (%.1f%%) - Current chunk: %d/%d silent\n", + d.silentChunks, d.totalChunks, + float64(d.silentChunks)/float64(d.totalChunks)*100, + silentSamples, len(samples)) + } + } + + // Convert float32 to int16 and write + for _, sample := range samples { + // Check for clipping in float domain + if sample > 1.0 || sample < -1.0 { + fmt.Printf("āš ļø Output clipping detected: %.3f\n", sample) + } + + // Clamp to prevent overflow + if sample > 1.0 { + sample = 1.0 + } else if sample < -1.0 { + sample = -1.0 + } + + // Convert to int16 + int16Sample := int16(sample * 32767.0) + if err := binary.Write(d.outputFile, binary.LittleEndian, int16Sample); err != nil { + fmt.Printf("Failed to write output sample: %v\n", err) + } + } +} + +// LogAudioStats logs statistics about audio samples +func (d *AudioDebugger) LogAudioStats(samples []float32, source 
string) { + if !d.enabled || len(samples) == 0 { + return + } + + // Calculate RMS + var sum float64 + var peak float32 + var clippedCount int + + for _, sample := range samples { + sum += float64(sample * sample) + + absSample := sample + if absSample < 0 { + absSample = -absSample + } + + if absSample > peak { + peak = absSample + } + + if sample > 1.0 || sample < -1.0 { + clippedCount++ + } + } + + rms := float32(sum / float64(len(samples))) + + if clippedCount > 0 || peak > 0.95 { + fmt.Printf("šŸ”Š %s Audio Stats: RMS=%.3f, Peak=%.3f, Clipped=%d/%d\n", + source, rms, peak, clippedCount, len(samples)) + } +} + +// Stop closes debug files and updates WAV headers +func (d *AudioDebugger) Stop() error { + if !d.enabled { + return nil + } + + var errs []error + + if d.inputFile != nil { + d.inputMutex.Lock() + if err := d.updateWAVHeader(d.inputFile); err != nil { + errs = append(errs, fmt.Errorf("failed to update input WAV header: %w", err)) + } + if err := d.inputFile.Close(); err != nil { + errs = append(errs, fmt.Errorf("failed to close input file: %w", err)) + } + d.inputMutex.Unlock() + } + + if d.outputFile != nil { + d.outputMutex.Lock() + if err := d.updateWAVHeader(d.outputFile); err != nil { + errs = append(errs, fmt.Errorf("failed to update output WAV header: %w", err)) + } + if err := d.outputFile.Close(); err != nil { + errs = append(errs, fmt.Errorf("failed to close output file: %w", err)) + } + d.outputMutex.Unlock() + } + + if len(errs) > 0 { + return fmt.Errorf("errors during stop: %v", errs) + } + + fmt.Println("šŸ“ Audio debug files saved") + return nil +} + +// writeWAVHeader writes a WAV file header +func (d *AudioDebugger) writeWAVHeader(file *os.File) error { + // WAV header structure + header := []byte{ + 'R', 'I', 'F', 'F', // ChunkID + 0, 0, 0, 0, // ChunkSize (to be filled later) + 'W', 'A', 'V', 'E', // Format + 'f', 'm', 't', ' ', // Subchunk1ID + 16, 0, 0, 0, // Subchunk1Size (16 for PCM) + 1, 0, // AudioFormat (1 = PCM) + byte(d.channels), byte(d.channels >> 8), // NumChannels + byte(d.sampleRate), byte(d.sampleRate >> 8), byte(d.sampleRate >> 16), byte(d.sampleRate >> 24), // SampleRate + 0, 0, 0, 0, // ByteRate (to be calculated) + 0, 0, // BlockAlign (to be calculated) + byte(d.bitsPerSample), byte(d.bitsPerSample >> 8), // BitsPerSample + 'd', 'a', 't', 'a', // Subchunk2ID + 0, 0, 0, 0, // Subchunk2Size (to be filled later) + } + + // Calculate ByteRate and BlockAlign + blockAlign := d.channels * d.bitsPerSample / 8 + byteRate := d.sampleRate * blockAlign + + // Update ByteRate + // #nosec G115 -- byteRate is calculated from safe constants + binary.LittleEndian.PutUint32(header[28:32], uint32(byteRate)) + // Update BlockAlign + // #nosec G115 -- blockAlign is calculated from safe constants + binary.LittleEndian.PutUint16(header[32:34], uint16(blockAlign)) + + _, err := file.Write(header) + return err +} + +// updateWAVHeader updates the WAV header with the correct file size +func (d *AudioDebugger) updateWAVHeader(file *os.File) error { + // Get file size + fileInfo, err := file.Stat() + if err != nil { + return err + } + + fileSize := fileInfo.Size() + + // Update ChunkSize (file size - 8) + if _, err := file.Seek(4, 0); err != nil { + return fmt.Errorf("failed to seek to chunk size position: %w", err) + } + // #nosec G115 -- fileSize is from file stat, safe for WAV header + if err := binary.Write(file, binary.LittleEndian, uint32(fileSize-8)); err != nil { + return fmt.Errorf("failed to write chunk size: %w", err) + } + + // Update Subchunk2Size (file 
size - 44) + if _, err := file.Seek(40, 0); err != nil { + return fmt.Errorf("failed to seek to subchunk size position: %w", err) + } + // #nosec G115 -- fileSize is from file stat, safe for WAV header + if err := binary.Write(file, binary.LittleEndian, uint32(fileSize-44)); err != nil { + return fmt.Errorf("failed to write subchunk size: %w", err) + } + + return nil +} + +// LogWebSocketAudio logs detailed information about incoming WebSocket audio +func (d *AudioDebugger) LogWebSocketAudio(samples []float32, timestamp time.Time) { + if !d.enabled || len(samples) == 0 { + return + } + + // Check for timing gaps in WebSocket audio + if !d.lastInputTime.IsZero() { + timeSinceLastWS := timestamp.Sub(d.lastInputTime) + expectedInterval := time.Duration(float64(len(samples)) / 16000.0 * float64(time.Second)) // 16kHz from Vapi + + if timeSinceLastWS > expectedInterval*3 { + fmt.Printf("🌐 WEBSOCKET AUDIO GAP: Expected %.2fms, got %.2fms (gap: %.2fms)\n", + float64(expectedInterval.Nanoseconds())/1e6, + float64(timeSinceLastWS.Nanoseconds())/1e6, + float64((timeSinceLastWS-expectedInterval).Nanoseconds())/1e6) + } + } + d.lastInputTime = timestamp + + // Analyze audio content + var silentSamples, clippedSamples int + var peak, rms float32 + for _, sample := range samples { + if sample > -0.001 && sample < 0.001 { + silentSamples++ + } + if sample > 1.0 || sample < -1.0 { + clippedSamples++ + } + + absSample := sample + if absSample < 0 { + absSample = -absSample + } + if absSample > peak { + peak = absSample + } + rms += sample * sample + } + rms /= float32(len(samples)) + + silenceRate := float64(silentSamples) / float64(len(samples)) + + // Log if significant silence or other issues + if silenceRate > 0.9 || clippedSamples > 0 || peak > 0.95 { + fmt.Printf("🌐 WebSocket Audio: %d samples, %.1f%% silent, peak=%.3f, rms=%.3f, clipped=%d\n", + len(samples), silenceRate*100, peak, rms, clippedSamples) + } +} + +// LogBufferState logs the current state of audio buffers +func (d *AudioDebugger) LogBufferState(inputAvailable, outputAvailable, inputSize, outputSize int) { + if !d.enabled { + return + } + + inputFill := float64(inputAvailable) / float64(inputSize) * 100 + outputFill := float64(outputAvailable) / float64(outputSize) * 100 + + // Log if buffers are getting too full or too empty + if inputFill < 10 || inputFill > 90 || outputFill < 10 || outputFill > 90 { + fmt.Printf("šŸ“Š Buffer State: Input %.1f%% (%d/%d), Output %.1f%% (%d/%d)\n", + inputFill, inputAvailable, inputSize, + outputFill, outputAvailable, outputSize) + } + + // Warn about potential underruns + if outputFill < 5 { + fmt.Printf("āš ļø OUTPUT BUFFER UNDERRUN RISK: Only %.1f%% filled (%d/%d samples)\n", + outputFill, outputAvailable, outputSize) + } +} + +// LogAudioFlow provides a comprehensive view of the audio pipeline state +func (d *AudioDebugger) LogAudioFlow(stage string, sampleCount int, timestamp time.Time) { + if !d.enabled { + return + } + + fmt.Printf("šŸŽµ Audio Flow [%s]: %d samples at %s\n", + stage, sampleCount, timestamp.Format("15:04:05.000")) +} diff --git a/pkg/voice/devices.go b/pkg/voice/devices.go new file mode 100644 index 0000000..f2f10a2 --- /dev/null +++ b/pkg/voice/devices.go @@ -0,0 +1,279 @@ +package voice + +import ( + "fmt" + "strings" + + "github.com/gordonklaus/portaudio" +) + +// AudioDevice represents an audio input or output device +type AudioDevice struct { + Index int + Name string + MaxInputChannels int + MaxOutputChannels int + DefaultSampleRate float64 + DefaultLowInputLatency float64 
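+	// Both latency fields are PortAudio's "default low" latency hints,
+	// converted to seconds when devices are enumerated.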
+ DefaultLowOutputLatency float64 + IsDefault bool +} + +// AudioDeviceManager manages audio device enumeration and selection +type AudioDeviceManager struct { + inputDevices []AudioDevice + outputDevices []AudioDevice + initialized bool +} + +// NewAudioDeviceManager creates a new audio device manager +func NewAudioDeviceManager() *AudioDeviceManager { + return &AudioDeviceManager{ + inputDevices: make([]AudioDevice, 0), + outputDevices: make([]AudioDevice, 0), + initialized: false, + } +} + +// Initialize initializes PortAudio and enumerates devices +func (m *AudioDeviceManager) Initialize() error { + if m.initialized { + return nil + } + + // Initialize PortAudio + if err := portaudio.Initialize(); err != nil { + return fmt.Errorf("failed to initialize PortAudio: %w", err) + } + + // Enumerate devices + if err := m.enumerateDevices(); err != nil { + if termErr := portaudio.Terminate(); termErr != nil { + fmt.Printf("Failed to terminate portaudio: %v\n", termErr) + } + return fmt.Errorf("failed to enumerate audio devices: %w", err) + } + + m.initialized = true + return nil +} + +// Terminate terminates PortAudio +func (m *AudioDeviceManager) Terminate() error { + if !m.initialized { + return nil + } + + if err := portaudio.Terminate(); err != nil { + return fmt.Errorf("failed to terminate PortAudio: %w", err) + } + + m.initialized = false + return nil +} + +// enumerateDevices discovers all available audio devices +func (m *AudioDeviceManager) enumerateDevices() error { + // Get default devices + defaultInput, err := portaudio.DefaultInputDevice() + if err != nil { + // Default input device might not be available + defaultInput = nil + } + + defaultOutput, err := portaudio.DefaultOutputDevice() + if err != nil { + return fmt.Errorf("failed to get default output device: %w", err) + } + + // Get all devices + devices, err := portaudio.Devices() + if err != nil { + return fmt.Errorf("failed to get audio devices: %w", err) + } + + // Clear existing device lists + m.inputDevices = make([]AudioDevice, 0) + m.outputDevices = make([]AudioDevice, 0) + + // Process each device + for i, device := range devices { + audioDevice := AudioDevice{ + Index: i, + Name: device.Name, + MaxInputChannels: device.MaxInputChannels, + MaxOutputChannels: device.MaxOutputChannels, + DefaultSampleRate: device.DefaultSampleRate, + DefaultLowInputLatency: device.DefaultLowInputLatency.Seconds(), + DefaultLowOutputLatency: device.DefaultLowOutputLatency.Seconds(), + IsDefault: false, + } + + // Check if this is the default input device + if defaultInput != nil && device == defaultInput { + audioDevice.IsDefault = true + } + + // Check if this is the default output device + if device == defaultOutput { + audioDevice.IsDefault = true + } + + // Add to appropriate device list + if device.MaxInputChannels > 0 { + m.inputDevices = append(m.inputDevices, audioDevice) + } + if device.MaxOutputChannels > 0 { + m.outputDevices = append(m.outputDevices, audioDevice) + } + } + + return nil +} + +// GetInputDevices returns all available input devices +func (m *AudioDeviceManager) GetInputDevices() ([]AudioDevice, error) { + if !m.initialized { + if err := m.Initialize(); err != nil { + return nil, err + } + } + return m.inputDevices, nil +} + +// GetOutputDevices returns all available output devices +func (m *AudioDeviceManager) GetOutputDevices() ([]AudioDevice, error) { + if !m.initialized { + if err := m.Initialize(); err != nil { + return nil, err + } + } + return m.outputDevices, nil +} + +// GetDefaultInputDevice returns the 
default input device
+func (m *AudioDeviceManager) GetDefaultInputDevice() (*AudioDevice, error) {
+	devices, err := m.GetInputDevices()
+	if err != nil {
+		return nil, err
+	}
+
+	for _, device := range devices {
+		if device.IsDefault {
+			return &device, nil
+		}
+	}
+
+	// If no default found, return the first available input device
+	if len(devices) > 0 {
+		return &devices[0], nil
+	}
+
+	return nil, fmt.Errorf("no input devices available")
+}
+
+// GetDefaultOutputDevice returns the default output device
+func (m *AudioDeviceManager) GetDefaultOutputDevice() (*AudioDevice, error) {
+	devices, err := m.GetOutputDevices()
+	if err != nil {
+		return nil, err
+	}
+
+	for _, device := range devices {
+		if device.IsDefault {
+			return &device, nil
+		}
+	}
+
+	// If no default found, return the first available output device
+	if len(devices) > 0 {
+		return &devices[0], nil
+	}
+
+	return nil, fmt.Errorf("no output devices available")
+}
+
+// FindInputDeviceByName finds an input device by name (case-insensitive partial match)
+func (m *AudioDeviceManager) FindInputDeviceByName(name string) (*AudioDevice, error) {
+	devices, err := m.GetInputDevices()
+	if err != nil {
+		return nil, err
+	}
+
+	name = strings.ToLower(name)
+
+	// First try exact match
+	for _, device := range devices {
+		if strings.EqualFold(device.Name, name) {
+			return &device, nil
+		}
+	}
+
+	// Then try partial match
+	for _, device := range devices {
+		if strings.Contains(strings.ToLower(device.Name), name) {
+			return &device, nil
+		}
+	}
+
+	return nil, fmt.Errorf("input device not found: %s", name)
+}
+
+// FindOutputDeviceByName finds an output device by name (case-insensitive partial match)
+func (m *AudioDeviceManager) FindOutputDeviceByName(name string) (*AudioDevice, error) {
+	devices, err := m.GetOutputDevices()
+	if err != nil {
+		return nil, err
+	}
+
+	name = strings.ToLower(name)
+
+	// First try exact match
+	for _, device := range devices {
+		if strings.EqualFold(device.Name, name) {
+			return &device, nil
+		}
+	}
+
+	// Then try partial match
+	for _, device := range devices {
+		if strings.Contains(strings.ToLower(device.Name), name) {
+			return &device, nil
+		}
+	}
+
+	return nil, fmt.Errorf("output device not found: %s", name)
+}
+
+// ListDevices returns a formatted string listing all audio devices
+func (m *AudioDeviceManager) ListDevices() (string, error) {
+	if !m.initialized {
+		if err := m.Initialize(); err != nil {
+			return "", err
+		}
+	}
+
+	var result strings.Builder
+
+	result.WriteString("šŸŽ¤ Input Devices:\n")
+	for _, device := range m.inputDevices {
+		defaultStr := ""
+		if device.IsDefault {
+			defaultStr = " (default)"
+		}
+		result.WriteString(fmt.Sprintf("  [%d] %s%s - %d channels, %.0f Hz\n",
+			device.Index, device.Name, defaultStr, device.MaxInputChannels, device.DefaultSampleRate))
+	}
+
+	result.WriteString("\nšŸ”Š Output Devices:\n")
+	for _, device := range m.outputDevices {
+		defaultStr := ""
+		if device.IsDefault {
+			defaultStr = " (default)"
+		}
+		result.WriteString(fmt.Sprintf("  [%d] %s%s - %d channels, %.0f Hz\n",
+			device.Index, device.Name, defaultStr, device.MaxOutputChannels, device.DefaultSampleRate))
+	}
+
+	return result.String(), nil
+}
diff --git a/pkg/voice/processor.go b/pkg/voice/processor.go
new file mode 100644
index 0000000..286955f
--- /dev/null
+++ b/pkg/voice/processor.go
@@ -0,0 +1,275 @@
+package voice
+
+// WebRTCAudioProcessor provides basic audio processing algorithms
+// (simple echo cancellation, a noise gate, and automatic gain control).
+// It is a simplified stand-in while full WebRTC integration is in progress.
+type 
WebRTCAudioProcessor struct { + enabled bool + sampleRate int + channels int + frameSize int + // Simple echo cancellation state + echoBuffer []float32 + adaptiveFilter []float32 + filterLength int + // Noise gate parameters + noiseGateThreshold float32 + noiseGateRatio float32 + // AGC (Automatic Gain Control) state + targetLevel float32 + currentGain float32 + agcEnabled bool +} + +// NewWebRTCAudioProcessor creates a new audio processor with basic algorithms +func NewWebRTCAudioProcessor(sampleRate, channels, frameSize int) (*WebRTCAudioProcessor, error) { + filterLength := 256 // Adaptive filter length for echo cancellation + processor := &WebRTCAudioProcessor{ + enabled: true, + sampleRate: sampleRate, + channels: channels, + frameSize: frameSize, + echoBuffer: make([]float32, filterLength), + adaptiveFilter: make([]float32, filterLength), + filterLength: filterLength, + noiseGateThreshold: 0.01, // -40dB + noiseGateRatio: 0.1, // 10:1 ratio + targetLevel: 0.5, // Target -6dB + currentGain: 1.0, + agcEnabled: true, + } + return processor, nil +} + +// ProcessMicrophoneAudio processes microphone input with basic audio processing +func (p *WebRTCAudioProcessor) ProcessMicrophoneAudio(micInput, speakerOutput []float32) []float32 { + if !p.enabled || len(micInput) == 0 { + return micInput + } + // Make a copy to avoid modifying original + processed := make([]float32, len(micInput)) + copy(processed, micInput) + // 1. Simple echo cancellation + if len(speakerOutput) > 0 { + processed = p.simpleEchoCancellation(processed, speakerOutput) + } + // 2. Noise gate + processed = p.noiseGate(processed) + // 3. Automatic Gain Control + if p.agcEnabled { + processed = p.automaticGainControl(processed) + } + return processed +} + +// ProcessSpeakerAudio processes speaker output (can add additional processing if needed) +func (p *WebRTCAudioProcessor) ProcessSpeakerAudio(input []float32) []float32 { + if !p.enabled { + return input + } + // For now, just pass through - could add gain control, EQ, etc. 
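+	// A speaker-side gain stage, if ever needed, would be a one-liner here
+	// (hypothetical, not enabled):
+	//
+	//	for i := range input { input[i] *= p.currentGain }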
+	return input
+}
+
+// SetEnabled enables or disables processing
+func (p *WebRTCAudioProcessor) SetEnabled(enabled bool) {
+	p.enabled = enabled
+}
+
+// Close cleans up the processor
+func (p *WebRTCAudioProcessor) Close() error {
+	// Reset buffers
+	p.echoBuffer = nil
+	p.adaptiveFilter = nil
+	return nil
+}
+
+// simpleEchoCancellation performs basic echo cancellation using adaptive filtering
+func (p *WebRTCAudioProcessor) simpleEchoCancellation(micInput, speakerOutput []float32) []float32 {
+	if len(speakerOutput) == 0 {
+		return micInput
+	}
+	processed := make([]float32, len(micInput))
+	for i, micSample := range micInput {
+		// Simple subtraction-based echo cancellation
+		// This is very basic - real echo cancellation is much more complex
+		var echo float32
+		if i < len(speakerOutput) {
+			// Apply a simple delay and attenuation
+			echo = speakerOutput[i] * 0.3 // 30% echo assumption
+		}
+		// Subtract estimated echo
+		processed[i] = micSample - echo
+		// Prevent over-cancellation
+		if processed[i] > 1.0 {
+			processed[i] = 1.0
+		} else if processed[i] < -1.0 {
+			processed[i] = -1.0
+		}
+	}
+	return processed
+}
+
+// noiseGate applies noise gating to reduce background noise
+func (p *WebRTCAudioProcessor) noiseGate(input []float32) []float32 {
+	processed := make([]float32, len(input))
+	for i, sample := range input {
+		amplitude := sample
+		if amplitude < 0 {
+			amplitude = -amplitude
+		}
+		if amplitude < p.noiseGateThreshold {
+			// Below threshold - apply ratio
+			processed[i] = sample * p.noiseGateRatio
+		} else {
+			// Above threshold - pass through
+			processed[i] = sample
+		}
+	}
+	return processed
+}
+
+// automaticGainControl maintains consistent audio levels
+func (p *WebRTCAudioProcessor) automaticGainControl(input []float32) []float32 {
+	if len(input) == 0 {
+		return input
+	}
+	// Sum the squared samples of the current frame
+	var sum float32
+	for _, sample := range input {
+		sum += sample * sample
+	}
+	// Mean-square power of the frame. A true RMS would take the square
+	// root; working on power keeps this dependency-free, and the gain
+	// clamps below keep the loop stable either way.
+	level := sum / float32(len(input))
+	// Adjust gain towards target level
+	if level > 0.001 { // Avoid division by zero
+		targetGain := p.targetLevel / level
+		// Smooth gain changes to avoid artifacts
+		alpha := float32(0.1) // Smoothing factor
+		p.currentGain = alpha*targetGain + (1-alpha)*p.currentGain
+		// Limit gain to reasonable range
+		if p.currentGain > 10.0 {
+			p.currentGain = 10.0
+		} else if p.currentGain < 0.1 {
+			p.currentGain = 0.1
+		}
+	}
+	// Apply gain
+	processed := make([]float32, len(input))
+	for i, sample := range input {
+		processed[i] = sample * p.currentGain
+		// Prevent clipping
+		if processed[i] > 1.0 {
+			processed[i] = 1.0
+		} else if processed[i] < -1.0 {
+			processed[i] = -1.0
+		}
+	}
+	return processed
+}
+
+// GetStats returns processing statistics
+func (p *WebRTCAudioProcessor) GetStats() map[string]interface{} {
+	return map[string]interface{}{
+		"enabled":     p.enabled,
+		"sample_rate": p.sampleRate,
+		"channels":    p.channels,
+		"frame_size":  p.frameSize,
+	}
+}
+
+// WebRTCResampler provides high-quality resampling using WebRTC algorithms
+type WebRTCResampler struct {
+	inputRate  int
+	outputRate int
+	channels   int
+}
+
+// NewWebRTCResampler creates a new WebRTC-based resampler
+func NewWebRTCResampler(inputRate, outputRate, channels int) (*WebRTCResampler, error) {
+	return &WebRTCResampler{
+		inputRate:  inputRate,
+		outputRate: outputRate,
+		channels:   channels,
+	}, nil
+}
+
+// Resample converts audio between the configured input and output rates.
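+// In this CLI the rates are fixed in practice: 48 kHz capture is reduced
+// 3:1 to Vapi's 16 kHz, and 16 kHz playback is expanded 1:3 back to
+// 48 kHz (see streamMicrophoneAudio and upsample16to48kHz in client.go).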
+func (r *WebRTCResampler) Resample(input []float32) ([]float32, error) {
+	if r.inputRate == r.outputRate {
+		return input, nil
+	}
+	// For now, fall back to linear-interpolation resampling
+	// TODO: Integrate with WebRTC's actual resampling when available
+	ratio := float64(r.outputRate) / float64(r.inputRate)
+	if ratio > 1.0 {
+		// Upsampling
+		return r.upsample(input, ratio), nil
+	} else {
+		// Downsampling
+		return r.downsample(input, ratio), nil
+	}
+}
+
+// upsample performs linear-interpolation upsampling
+func (r *WebRTCResampler) upsample(input []float32, ratio float64) []float32 {
+	outputLen := int(float64(len(input)) * ratio)
+	output := make([]float32, outputLen)
+	for i := 0; i < outputLen; i++ {
+		srcPos := float64(i) / ratio
+		srcIndex := int(srcPos)
+		frac := float32(srcPos - float64(srcIndex))
+		if srcIndex >= len(input)-1 {
+			output[i] = input[len(input)-1]
+		} else {
+			// Linear interpolation between the two neighbouring samples,
+			// so the weight reaches exactly 1.0 at the next sample
+			sample1 := input[srcIndex]
+			sample2 := input[srcIndex+1]
+			output[i] = sample1*(1-frac) + sample2*frac
+		}
+	}
+	return output
+}
+
+// downsample performs anti-aliased downsampling
+func (r *WebRTCResampler) downsample(input []float32, ratio float64) []float32 {
+	// Apply anti-aliasing filter first
+	filtered := r.antiAliasFilter(input, ratio)
+	outputLen := int(float64(len(filtered)) * ratio)
+	output := make([]float32, outputLen)
+	for i := 0; i < outputLen; i++ {
+		srcPos := float64(i) / ratio
+		srcIndex := int(srcPos + 0.5) // Round to nearest
+		if srcIndex >= len(filtered) {
+			srcIndex = len(filtered) - 1
+		}
+		output[i] = filtered[srcIndex]
+	}
+	return output
+}
+
+// antiAliasFilter applies a simple anti-aliasing filter before downsampling
+func (r *WebRTCResampler) antiAliasFilter(input []float32, ratio float64) []float32 {
+	if len(input) < 3 || ratio >= 1.0 {
+		return input
+	}
+	output := make([]float32, len(input))
+	// Simple 3-tap moving average filter
+	output[0] = input[0]
+	for i := 1; i < len(input)-1; i++ {
+		output[i] = 0.25*input[i-1] + 0.5*input[i] + 0.25*input[i+1]
+	}
+	output[len(output)-1] = input[len(input)-1]
+	return output
+}
+
+// Close cleans up the resampler
+func (r *WebRTCResampler) Close() error {
+	return nil
+}
diff --git a/pkg/voice/signaling.go b/pkg/voice/signaling.go
new file mode 100644
index 0000000..5de9735
--- /dev/null
+++ b/pkg/voice/signaling.go
@@ -0,0 +1,274 @@
+package voice
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/gorilla/websocket"
+)
+
+// VapiWebSocket handles WebSocket communication with Vapi transport
+type VapiWebSocket struct {
+	conn   *websocket.Conn
+	wsURL  string
+	events chan SignalingEvent
+
+	// Control
+	connected bool
+	mutex     sync.RWMutex
+	done      chan struct{}
+}
+
+// SignalingEvent represents a signaling event
+type SignalingEvent struct {
+	Type      string      `json:"type"`
+	Data      interface{} `json:"data"`
+	From      string      `json:"from,omitempty"`
+	Timestamp time.Time   `json:"timestamp"`
+}
+
+// WebSocket message types for Vapi transport
+const (
+	MSG_ROOM_JOINED = "room-joined"
+	MSG_ERROR       = "error"
+)
+
+// NewVapiWebSocket creates a new Vapi WebSocket client
+func NewVapiWebSocket() *VapiWebSocket {
+	return &VapiWebSocket{
+		events: make(chan SignalingEvent, 100),
+		done:   make(chan struct{}),
+	}
+}
+
+// Connect connects to Vapi WebSocket transport
+func (s *VapiWebSocket) Connect(wsURL string) error {
+	s.mutex.Lock()
+	defer 
s.mutex.Unlock() + + if s.connected { + return fmt.Errorf("already connected to WebSocket transport") + } + + if wsURL == "" { + return fmt.Errorf("WebSocket URL is required") + } + + s.wsURL = wsURL + + dialer := websocket.DefaultDialer + dialer.HandshakeTimeout = 10 * time.Second + + // Add authentication headers for Vapi WebSocket + headers := http.Header{} + + fmt.Printf("šŸ• Starting WebSocket handshake (timeout: 10s)...\n") + conn, resp, err := dialer.Dial(wsURL, headers) + if err != nil { + if resp != nil { + defer resp.Body.Close() //nolint:errcheck // Error handling would complicate deferred cleanup + if body, readErr := io.ReadAll(resp.Body); readErr == nil { + return fmt.Errorf("WebSocket handshake failed (status %d): %s", resp.StatusCode, string(body)) + } + } + return fmt.Errorf("failed to connect to Vapi WebSocket: %w", err) + } + + s.conn = conn + s.connected = true + + fmt.Printf("āœ… WebSocket connection established successfully\n") + + // Start message handling for Vapi transport events + go s.handleMessages() + + return nil +} + +// handleMessages processes incoming WebSocket messages +func (s *VapiWebSocket) handleMessages() { + defer func() { + if r := recover(); r != nil { //nolint:staticcheck // Empty branch is intentional for panic recovery + // Panic recovery - websocket connection was closed + // Intentionally empty - we handle cleanup below + } + s.mutex.Lock() + s.connected = false + if s.conn != nil { + if err := s.conn.Close(); err != nil { + fmt.Printf("Failed to close WebSocket connection: %v\n", err) + } + } + s.mutex.Unlock() + }() + + for { + select { + case <-s.done: + return + default: + // Read message from WebSocket (blocking) + messageType, data, err := s.conn.ReadMessage() + if err != nil { + if websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) { + // Normal closure + return + } + + // Send error event + s.events <- SignalingEvent{ + Type: "websocket_error", + Data: err.Error(), + Timestamp: time.Now(), + } + return + } + + switch messageType { + case websocket.TextMessage: + s.handleTextMessage(data) + case websocket.BinaryMessage: + s.handleBinaryMessage(data) + } + } + } +} + +// handleTextMessage processes JSON control messages from Vapi WebSocket transport +func (s *VapiWebSocket) handleTextMessage(data []byte) { + var message map[string]interface{} + if err := json.Unmarshal(data, &message); err != nil { + s.events <- SignalingEvent{ + Type: "parse_error", + Data: string(data), + Timestamp: time.Now(), + } + return + } + + // Vapi WebSocket transport messages + // Common types: speech-update, transcript, function-call, hang, etc. 
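+	// A typical text frame looks roughly like this (shape inferred from
+	// the handlers below, not from a published schema):
+	//
+	//	{"type":"transcript","role":"assistant","transcript":"Hello!"}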
+	msgType := "vapi_transport_event"
+	if eventType, ok := message["type"].(string); ok {
+		msgType = eventType
+	}
+
+	// Create signaling event for Vapi transport
+	event := SignalingEvent{
+		Type:      msgType,
+		Data:      message,
+		Timestamp: time.Now(),
+	}
+
+	// Send event to listeners
+	select {
+	case s.events <- event:
+	default:
+		// Channel full, drop event
+	}
+}
+
+// handleBinaryMessage processes binary audio data from Vapi WebSocket transport
+func (s *VapiWebSocket) handleBinaryMessage(data []byte) {
+	// Binary data is PCM audio from the assistant
+	// Convert to float32 samples for audio playback
+	if len(data)%2 != 0 {
+		return
+	}
+
+	// Convert PCM 16-bit little-endian to float32 samples
+	samples := make([]float32, len(data)/2)
+	for i := 0; i < len(samples); i++ {
+		// Read 16-bit little-endian sample
+		low := uint16(data[i*2])
+		high := uint16(data[i*2+1])
+		// Combine the bytes as uint16 first, then reinterpret as a signed sample
+		sample := int16(low | high<<8) //nolint:gosec // Safe conversion for audio data
+		// Convert to float32 (-1.0 to 1.0) with proper scaling
+		samples[i] = float32(sample) / 32767.0
+	}
+
+	// Send audio samples to output stream via event
+	s.events <- SignalingEvent{
+		Type:      "audio_data",
+		Data:      samples,
+		Timestamp: time.Now(),
+	}
+}
+
+// SendAudioData sends binary audio data to Vapi WebSocket transport
+func (s *VapiWebSocket) SendAudioData(samples []float32) error {
+	s.mutex.RLock()
+	conn := s.conn
+	connected := s.connected
+	s.mutex.RUnlock()
+
+	if !connected || conn == nil {
+		return fmt.Errorf("not connected to WebSocket transport")
+	}
+
+	// Convert float32 samples to PCM 16-bit little-endian
+	data := make([]byte, len(samples)*2)
+	for i, sample := range samples {
+		// Clamp to [-1.0, 1.0] and convert to int16
+		if sample > 1.0 {
+			sample = 1.0
+		} else if sample < -1.0 {
+			sample = -1.0
+		}
+
+		pcmSample := int16(sample * 32767.0)
+
+		// Write as little-endian
+		data[i*2] = byte(pcmSample & 0xFF)
+		data[i*2+1] = byte((pcmSample >> 8) & 0xFF)
+	}
+
+	return conn.WriteMessage(websocket.BinaryMessage, data)
+}
+
+// GetEvents returns the events channel
+func (s *VapiWebSocket) GetEvents() <-chan SignalingEvent {
+	return s.events
+}
+
+// IsConnected returns true if connected to the signaling server
+func (s *VapiWebSocket) IsConnected() bool {
+	s.mutex.RLock()
+	defer s.mutex.RUnlock()
+	return s.connected
+}
+
+// Close closes the signaling connection
+func (s *VapiWebSocket) Close() error {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+
+	if !s.connected {
+		return nil
+	}
+
+	// Set connected to false first to stop message reading
+	s.connected = false
+
+	// Close connection immediately to interrupt any blocking reads
+	var err error
+	if s.conn != nil {
+		err = s.conn.Close()
+		s.conn = nil
+	}
+
+	// Signal shutdown to the handleMessages goroutine
+	select {
+	case <-s.done:
+		// already closed
+	default:
+		close(s.done)
+	}
+
+	return err
+}
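Reviewer note: a minimal consumer sketch for this client from outside the package. The URL is a placeholder (in the CLI it comes from the /call response), and the event loop never exits because GetEvents is never closed. Illustrative only, not part of this diff:

	// signaling_sketch.go (illustrative only)
	package main

	import (
		"fmt"
		"log"

		"github.com/VapiAI/cli/pkg/voice"
	)

	func main() {
		ws := voice.NewVapiWebSocket()
		if err := ws.Connect("wss://example.invalid/transport"); err != nil {
			log.Fatal(err)
		}
		defer func() { _ = ws.Close() }()

		// Send 20ms of silence upstream (320 samples at 16kHz)
		if err := ws.SendAudioData(make([]float32, 320)); err != nil {
			log.Println("send:", err)
		}

		// Drain events; "audio_data" carries []float32 PCM from the assistant
		for ev := range ws.GetEvents() {
			if samples, ok := ev.Data.([]float32); ok && ev.Type == "audio_data" {
				fmt.Printf("received %d samples\n", len(samples))
			}
		}
	}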
diff --git a/pkg/voice/terminal.go b/pkg/voice/terminal.go
new file mode 100644
index 0000000..76509c1
--- /dev/null
+++ b/pkg/voice/terminal.go
@@ -0,0 +1,291 @@
+package voice
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"github.com/charmbracelet/lipgloss"
+	"golang.org/x/term"
+)
+
+// TerminalUI manages the terminal interface for voice calls
+type TerminalUI struct {
+	client     *VoiceClient
+	done       chan bool
+	keyEvents  chan rune
+	uiUpdates  chan UIUpdate
+	callEvents chan CallEvent
+
+	// Styles
+	successStyle lipgloss.Style
+	errorStyle   lipgloss.Style
+	infoStyle    lipgloss.Style
+	headerStyle  lipgloss.Style
+
+	// Terminal state
+	origTermState  *term.State
+	rawModeEnabled bool
+	stdinFD        int
+}
+
+// UIUpdate represents a terminal UI update
+type UIUpdate struct {
+	Type string
+	Data interface{}
+}
+
+// NewTerminalUI creates a new terminal UI manager
+func NewTerminalUI(client *VoiceClient) *TerminalUI {
+	return &TerminalUI{
+		client: client,
+		// Buffered so handleCallEvent can signal shutdown from inside the UI
+		// loop without deadlocking on its own receiver
+		done:       make(chan bool, 1),
+		keyEvents:  make(chan rune),
+		uiUpdates:  make(chan UIUpdate),
+		callEvents: make(chan CallEvent),
+
+		// Initialize styles
+		successStyle: lipgloss.NewStyle().Foreground(lipgloss.Color("#00FF00")).Bold(true),
+		errorStyle:   lipgloss.NewStyle().Foreground(lipgloss.Color("#FF0000")).Bold(true),
+		infoStyle:    lipgloss.NewStyle().Foreground(lipgloss.Color("#00BFFF")),
+		headerStyle:  lipgloss.NewStyle().Foreground(lipgloss.Color("#FFFF00")).Bold(true),
+	}
+}
+
+// Run starts the terminal UI
+func (ui *TerminalUI) Run() error {
+	// Display initial header
+	ui.displayHeader()
+
+	// Set up signal handling for graceful shutdown
+	c := make(chan os.Signal, 1)
+	signal.Notify(c, os.Interrupt, syscall.SIGTERM)
+
+	// Start event monitoring goroutines
+	go ui.monitorCallEvents()
+	go ui.handleKeyboardInput()
+
+	// Main event loop
+	for {
+		select {
+		case <-c:
+			// Interrupt signal received
+			fmt.Println(ui.infoStyle.Render("\nShutting down..."))
+			return ui.shutdown()
+
+		case event := <-ui.callEvents:
+			ui.handleCallEvent(event)
+
+		case update := <-ui.uiUpdates:
+			ui.handleUIUpdate(update)
+
+		case <-ui.done:
+			return nil
+		}
+	}
+}
+
+// displayHeader shows the initial UI header
+func (ui *TerminalUI) displayHeader() {
+	fmt.Println(ui.headerStyle.Render("šŸš€ Vapi Voice Call"))
+	fmt.Println()
+	fmt.Println(ui.infoStyle.Render("Starting voice call..."))
+	fmt.Println(ui.infoStyle.Render("Press Ctrl+C to end the call"))
+	fmt.Println(ui.infoStyle.Render("Controls: [s] Status [q] End call [h] Help"))
+	fmt.Println()
+}
+
+// monitorCallEvents monitors call events from the voice client
+func (ui *TerminalUI) monitorCallEvents() {
+	for event := range ui.client.GetCallEvents() {
+		ui.callEvents <- event
+	}
+}
+
+// handleCallEvent processes call events
+func (ui *TerminalUI) handleCallEvent(event CallEvent) {
+	timestamp := event.Timestamp.Format("15:04:05")
+
+	switch event.Type {
+	case "call_started":
+		fmt.Printf("[%s] %s Call started successfully\n",
+			timestamp, ui.successStyle.Render("āœ“"))
+		ui.displayCallStatus()
+
+	case "call_ended":
+		fmt.Printf("[%s] %s Call ended\n",
+			timestamp, ui.infoStyle.Render("•"))
+		ui.done <- true
+
+	case "ice_connection_state_change":
+		// Tolerate unexpected payload types instead of panicking on a failed assertion
+		state, _ := event.Data.(string)
+		fmt.Printf("[%s] %s Connection state: %s\n",
+			timestamp, ui.infoStyle.Render("•"), state)
+
+	case "ice_candidate":
+		fmt.Printf("[%s] %s Connection negotiation\n",
+			timestamp, ui.infoStyle.Render("•"))
+
+	case "offer_sent":
+		fmt.Printf("[%s] %s Audio connection established\n",
+			timestamp, ui.infoStyle.Render("•"))
+
+	case "room_connected":
+		fmt.Printf("[%s] %s Connected to Vapi WebSocket transport\n",
+			timestamp, ui.successStyle.Render("āœ“"))
+
+	case "participant_joined":
+		fmt.Printf("[%s] %s Participant joined call\n",
+			timestamp, ui.successStyle.Render("āœ“"))
+
+	case "connection_error":
+		fmt.Printf("[%s] %s Connection error: %v\n",
+			timestamp, ui.errorStyle.Render("āœ—"), event.Data)
+
+	case "signaling_room_joined":
+		fmt.Printf("[%s] %s Vapi WebSocket connected\n",
+			timestamp, ui.successStyle.Render("āœ“"))
+
+	default:
+		// Show all events for debugging
+		if 
event.Type != "" {
+			fmt.Printf("[%s] %s %s\n",
+				timestamp, ui.infoStyle.Render("•"), event.Type)
+		}
+	}
+}
+
+// handleUIUpdate processes UI updates
+func (ui *TerminalUI) handleUIUpdate(update UIUpdate) {
+	switch update.Type {
+	case "status_update":
+		ui.displayCallStatus()
+	case "error":
+		fmt.Printf("%s %v\n", ui.errorStyle.Render("āœ—"), update.Data)
+	}
+}
+
+// displayCallStatus shows current call status
+func (ui *TerminalUI) displayCallStatus() {
+	state := ui.client.GetCallState()
+
+	fmt.Println(ui.headerStyle.Render("šŸ“ž Call Status"))
+	fmt.Printf("  Call ID: %s\n", state.CallID)
+	fmt.Printf("  Assistant: %s\n", state.AssistantID)
+	fmt.Printf("  Status: %s\n", ui.formatStatus(state.Status))
+	fmt.Printf("  Duration: %s\n", ui.formatDuration(state.StartTime))
+
+	if state.WebSocketURL != "" {
+		fmt.Printf("  Room: %s\n", state.CallID)
+		fmt.Printf("  WebSocket URL: %s\n", state.WebSocketURL)
+	}
+
+	// Display audio status
+	if ui.client.IsAudioRunning() {
+		fmt.Printf("  Audio: %s\n", ui.successStyle.Render("Active"))
+	} else {
+		fmt.Printf("  Audio: %s\n", ui.errorStyle.Render("Inactive"))
+	}
+
+	fmt.Println()
+}
+
+// formatStatus formats call status with appropriate colors
+func (ui *TerminalUI) formatStatus(status CallStatus) string {
+	switch status {
+	case CallStatusConnected:
+		return ui.successStyle.Render(string(status))
+	case CallStatusFailed, CallStatusDisconnected:
+		return ui.errorStyle.Render(string(status))
+	case CallStatusIdle, CallStatusConnecting:
+		return ui.infoStyle.Render(string(status))
+	default:
+		return ui.infoStyle.Render(string(status))
+	}
+}
+
+// formatDuration formats call duration
+func (ui *TerminalUI) formatDuration(startTime time.Time) string {
+	if startTime.IsZero() {
+		return "00:00:00"
+	}
+
+	duration := time.Since(startTime)
+	hours := int(duration.Hours())
+	minutes := int(duration.Minutes()) % 60
+	seconds := int(duration.Seconds()) % 60
+
+	return fmt.Sprintf("%02d:%02d:%02d", hours, minutes, seconds)
+}
+
+// handleKeyboardInput reads single-key controls ([s]tatus, [q]uit, [h]elp) in raw mode
+func (ui *TerminalUI) handleKeyboardInput() {
+	fd := int(os.Stdin.Fd())
+	ui.stdinFD = fd
+
+	if term.IsTerminal(fd) {
+		if oldState, err := term.MakeRaw(fd); err == nil {
+			ui.origTermState = oldState
+			ui.rawModeEnabled = true
+		}
+	}
+
+	// Ensure terminal is restored when this goroutine exits
+	defer func() {
+		if ui.rawModeEnabled && ui.origTermState != nil {
+			_ = term.Restore(ui.stdinFD, ui.origTermState)
+			ui.rawModeEnabled = false
+		}
+	}()
+
+	reader := bufio.NewReader(os.Stdin)
+	for {
+		b, err := reader.ReadByte()
+		if err != nil {
+			return
+		}
+		switch b {
+		case 'q', 'Q':
+			fmt.Println(ui.infoStyle.Render("\nEnding call (q pressed)..."))
+			_ = ui.shutdown()
+			return
+		case 's', 'S':
+			// Trigger a status update in the UI loop
+			ui.uiUpdates <- UIUpdate{Type: "status_update"}
+		case 'h', 'H':
+			fmt.Println(ui.infoStyle.Render("Controls: [s] Status [q] End call [h] Help"))
+		default:
+			// ignore other keys
+		}
+	}
+}
+
+// shutdown gracefully shuts down the terminal UI
+func (ui *TerminalUI) shutdown() error {
+	// Restore terminal if we enabled raw mode
+	if ui.rawModeEnabled && ui.origTermState != nil {
+		_ = term.Restore(ui.stdinFD, ui.origTermState)
+		ui.rawModeEnabled = false
+	}
+	fmt.Println(ui.infoStyle.Render("Ending voice call..."))
+
+	// End the call if still active
+	if ui.client.GetCallState().Status == CallStatusConnected {
+		if err := ui.client.EndCall(); err != nil {
+			fmt.Printf("%s Failed to end call: 
%v\n", ui.errorStyle.Render("āœ—"), err) + // Don't return error, continue with shutdown + } + } + + // Give a brief moment for cleanup to complete + time.Sleep(200 * time.Millisecond) + + fmt.Println(ui.successStyle.Render("āœ“ Voice call ended successfully")) + + // Force exit the process + os.Exit(0) + return nil // This line will never be reached, but Go requires it +} diff --git a/pkg/voice/webrtc_processor.go b/pkg/voice/webrtc_processor.go new file mode 100644 index 0000000..7143868 --- /dev/null +++ b/pkg/voice/webrtc_processor.go @@ -0,0 +1,155 @@ +package voice + +import ( + "fmt" + "math" + + "github.com/gorilla/websocket" +) + +// WebSocketAudioProcessor handles audio processing with basic echo cancellation +type WebSocketAudioProcessor struct { + // Echo cancellation state + echoBuffer []float32 + echoBufferSize int + adaptiveFilter []float32 + learningRate float32 + // Noise gate parameters + noiseGateThreshold float32 + gateRatio float32 +} + +// AudioPacket represents the structure for WebSocket audio data +type AudioPacket struct { + MicSamples []float32 `json:"micSamples"` + SpeakerSamples []float32 `json:"speakerSamples,omitempty"` + Timestamp int64 `json:"timestamp"` +} + +// NewWebSocketAudioProcessor creates a new audio processor with basic echo cancellation +func NewWebSocketAudioProcessor() (*WebSocketAudioProcessor, error) { + const echoBufferSizeMs = 200 // 200ms echo buffer + const sampleRate = 16000 // 16kHz sample rate (Vapi's format) + echoBufferSize := (sampleRate * echoBufferSizeMs) / 1000 + return &WebSocketAudioProcessor{ + echoBuffer: make([]float32, echoBufferSize), + echoBufferSize: echoBufferSize, + adaptiveFilter: make([]float32, 128), // 128-tap adaptive filter + learningRate: 0.01, + noiseGateThreshold: 0.01, // -40dB noise gate + gateRatio: 0.1, // 10:1 ratio + }, nil +} + +// ProcessAudio applies basic echo cancellation and noise reduction +func (wap *WebSocketAudioProcessor) ProcessAudio(micInput, speakerOutput []float32) []float32 { + if len(micInput) == 0 { + return micInput + } + processed := make([]float32, len(micInput)) + copy(processed, micInput) + // Apply basic echo cancellation if we have speaker output + if len(speakerOutput) > 0 { + processed = wap.applyEchoCancellation(processed, speakerOutput) + } + // Apply noise gate + processed = wap.applyNoiseGate(processed) + return processed +} + +// applyEchoCancellation implements a basic adaptive echo cancellation algorithm +func (wap *WebSocketAudioProcessor) applyEchoCancellation(micInput, speakerOutput []float32) []float32 { + result := make([]float32, len(micInput)) + for i, sample := range micInput { + // Store speaker output in echo buffer (circular buffer) + if len(speakerOutput) > i { + bufferIdx := (i) % wap.echoBufferSize + wap.echoBuffer[bufferIdx] = speakerOutput[i] + } + // Estimate echo using adaptive filter + var echoEstimate float32 + filterLen := len(wap.adaptiveFilter) + for j := 0; j < filterLen && j < wap.echoBufferSize; j++ { + bufferIdx := (i - j + wap.echoBufferSize) % wap.echoBufferSize + echoEstimate += wap.adaptiveFilter[j] * wap.echoBuffer[bufferIdx] + } + // Subtract estimated echo from microphone input + result[i] = sample - echoEstimate + // Update adaptive filter using LMS algorithm + errorSignal := result[i] + for j := 0; j < filterLen && j < wap.echoBufferSize; j++ { + bufferIdx := (i - j + wap.echoBufferSize) % wap.echoBufferSize + wap.adaptiveFilter[j] += wap.learningRate * errorSignal * wap.echoBuffer[bufferIdx] + } + } + return result +} + +// 
applyNoiseGate applies a simple noise gate to reduce background noise +func (wap *WebSocketAudioProcessor) applyNoiseGate(input []float32) []float32 { + result := make([]float32, len(input)) + for i, sample := range input { + amplitude := float32(math.Abs(float64(sample))) + if amplitude > wap.noiseGateThreshold { + // Above threshold - pass through + result[i] = sample + } else { + // Below threshold - apply gate ratio + result[i] = sample * wap.gateRatio + } + } + return result +} + +// HandleWebSocket processes WebSocket connections with audio processing +func (wap *WebSocketAudioProcessor) HandleWebSocket(ws *websocket.Conn) error { + defer func() { + if err := ws.Close(); err != nil { + fmt.Printf("Failed to close websocket: %v\n", err) + } + }() + for { + // Read audio data from WebSocket + var audioData AudioPacket + err := ws.ReadJSON(&audioData) + if err != nil { + if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) { + return fmt.Errorf("websocket read error: %w", err) + } + break + } + // Process audio with echo cancellation and noise reduction + processed := wap.ProcessAudio(audioData.MicSamples, audioData.SpeakerSamples) + // Send processed audio back via WebSocket + response := AudioPacket{ + MicSamples: processed, + Timestamp: audioData.Timestamp, + } + if err := ws.WriteJSON(response); err != nil { + return fmt.Errorf("websocket write error: %w", err) + } + } + return nil +} + +// Reset clears the processor's internal state +func (wap *WebSocketAudioProcessor) Reset() { + // Clear echo buffer + for i := range wap.echoBuffer { + wap.echoBuffer[i] = 0 + } + // Reset adaptive filter + for i := range wap.adaptiveFilter { + wap.adaptiveFilter[i] = 0 + } +} + +// SetNoiseGateThreshold adjusts the noise gate sensitivity +func (wap *WebSocketAudioProcessor) SetNoiseGateThreshold(threshold float32) { + wap.noiseGateThreshold = threshold +} + +// SetLearningRate adjusts the adaptive filter learning rate +func (wap *WebSocketAudioProcessor) SetLearningRate(rate float32) { + wap.learningRate = rate +} diff --git a/pkg/voice/websocket_jitter.go b/pkg/voice/websocket_jitter.go new file mode 100644 index 0000000..54c957b --- /dev/null +++ b/pkg/voice/websocket_jitter.go @@ -0,0 +1,327 @@ +package voice + +import ( + "fmt" + "log" + "sync" + "time" +) + +// WebSocketJitterBuffer provides adaptive jitter buffering for WebSocket audio +type WebSocketJitterBuffer struct { + // Configuration + targetDelay time.Duration + maxDelay time.Duration + minDelay time.Duration + sampleRate int + + // Buffer management + audioBuffer [][]float32 + bufferMutex sync.RWMutex + + // Timing control + lastWriteTime time.Time + lastReadTime time.Time + readInterval time.Duration + + // Adaptive delay + currentDelay time.Duration + delayMutex sync.RWMutex + + // Control + running bool + runMutex sync.RWMutex + + // Statistics + packetsReceived int64 + packetsDropped int64 + underruns int64 + overruns int64 + + // Read ticker for consistent output + ticker *time.Ticker + outputChan chan []float32 +} + +// WebSocketJitterConfig holds configuration for WebSocket jitter buffer +type WebSocketJitterConfig struct { + SampleRate int // Audio sample rate (16000 for Vapi) + MinDelay time.Duration // Minimum buffer delay + MaxDelay time.Duration // Maximum buffer delay + TargetDelay time.Duration // Initial target delay + PacketInterval time.Duration // Expected packet interval (20ms for Vapi) +} + +// DefaultWebSocketJitterConfig returns optimized config for Vapi WebSocket 
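+// At the 20ms packet interval, the 80ms target delay corresponds to holding
+// roughly four packets before playback begins - a common trade-off between
+// added latency and resilience to network jitter.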
+func DefaultWebSocketJitterConfig() *WebSocketJitterConfig { + return &WebSocketJitterConfig{ + SampleRate: 16000, + MinDelay: 40 * time.Millisecond, // Minimum 40ms buffering + MaxDelay: 200 * time.Millisecond, // Maximum 200ms buffering + TargetDelay: 80 * time.Millisecond, // Target 80ms - good for voice + PacketInterval: 20 * time.Millisecond, // Vapi sends 20ms packets + } +} + +// NewWebSocketJitterBuffer creates a new WebSocket-compatible jitter buffer +func NewWebSocketJitterBuffer(config *WebSocketJitterConfig) (*WebSocketJitterBuffer, error) { + if config == nil { + config = DefaultWebSocketJitterConfig() + } + + jb := &WebSocketJitterBuffer{ + targetDelay: config.TargetDelay, + maxDelay: config.MaxDelay, + minDelay: config.MinDelay, + sampleRate: config.SampleRate, + currentDelay: config.TargetDelay, + readInterval: config.PacketInterval, + audioBuffer: make([][]float32, 0, 50), // Pre-allocate for ~1 second + outputChan: make(chan []float32, 10), + } + + return jb, nil +} + +// WriteAudio adds audio samples to the jitter buffer +func (jb *WebSocketJitterBuffer) WriteAudio(samples []float32) error { + if !jb.IsRunning() { + return fmt.Errorf("jitter buffer not running") + } + + now := time.Now() + + jb.bufferMutex.Lock() + defer jb.bufferMutex.Unlock() + + // Copy samples to avoid any reference issues + sampleCopy := make([]float32, len(samples)) + copy(sampleCopy, samples) + + // Add to buffer + jb.audioBuffer = append(jb.audioBuffer, sampleCopy) + jb.packetsReceived++ + jb.lastWriteTime = now + + // Check for buffer overflow + maxBufferSize := int(jb.maxDelay / jb.readInterval) + if len(jb.audioBuffer) > maxBufferSize { + // Drop oldest packet + jb.audioBuffer = jb.audioBuffer[1:] + jb.overruns++ + if jb.overruns%25 == 0 { + log.Printf("āš ļø Jitter buffer overrun: dropped oldest packet (total: %d)", jb.overruns) + } + } + + // Adaptive delay adjustment based on buffer fill + jb.adjustDelay() + + return nil +} + +// adjustDelay adapts the buffer delay based on current conditions +func (jb *WebSocketJitterBuffer) adjustDelay() { + bufferSize := len(jb.audioBuffer) + targetBufferSize := int(jb.targetDelay / jb.readInterval) + + jb.delayMutex.Lock() + defer jb.delayMutex.Unlock() + + // Adjust target delay based on buffer fill + if bufferSize < targetBufferSize/2 { + // Buffer running low - increase delay slightly + jb.currentDelay += 5 * time.Millisecond + if jb.currentDelay > jb.maxDelay { + jb.currentDelay = jb.maxDelay + } + } else if bufferSize > targetBufferSize*2 { + // Buffer getting too full - decrease delay slightly + jb.currentDelay -= 5 * time.Millisecond + if jb.currentDelay < jb.minDelay { + jb.currentDelay = jb.minDelay + } + } +} + +// ReadAudio reads processed audio samples from the jitter buffer +func (jb *WebSocketJitterBuffer) ReadAudio(numSamples int) []float32 { + if !jb.IsRunning() { + return make([]float32, numSamples) // Return silence + } + + // Try to get samples from output channel with timeout + select { + case samples := <-jb.outputChan: + // Resize to requested length if needed + if len(samples) == numSamples { + return samples + } + + result := make([]float32, numSamples) + if len(samples) > 0 { + copy(result, samples) + } + return result + + case <-time.After(10 * time.Millisecond): + // Timeout - return silence to prevent blocking + jb.underruns++ + if jb.underruns%50 == 0 { + log.Printf("āš ļø Jitter buffer underrun: no data available (total: %d)", jb.underruns) + } + return make([]float32, numSamples) + } +} + +// Start begins jitter buffer 
operation +func (jb *WebSocketJitterBuffer) Start() error { + jb.runMutex.Lock() + defer jb.runMutex.Unlock() + + if jb.running { + return fmt.Errorf("jitter buffer already running") + } + + jb.running = true + + // Start read ticker for consistent output timing + jb.ticker = time.NewTicker(jb.readInterval) + go jb.readLoop() + + // Start stats monitoring + go jb.monitorStats() + + log.Printf("šŸŽµ WebSocket Jitter Buffer started (target delay: %v, interval: %v)", + jb.targetDelay, jb.readInterval) + return nil +} + +// readLoop continuously reads from buffer and outputs at regular intervals +func (jb *WebSocketJitterBuffer) readLoop() { + defer jb.ticker.Stop() + + initialDelay := jb.currentDelay + log.Printf("šŸŽµ Jitter buffer starting with %v initial delay", initialDelay) + + // Initial delay before starting to read + time.Sleep(initialDelay) + + for range jb.ticker.C { + if !jb.IsRunning() { + return + } + + jb.bufferMutex.RLock() + bufferLen := len(jb.audioBuffer) + + if bufferLen > 0 { + // Get the oldest packet + samples := jb.audioBuffer[0] + + // Remove from buffer + jb.bufferMutex.RUnlock() + jb.bufferMutex.Lock() + if len(jb.audioBuffer) > 0 { + jb.audioBuffer = jb.audioBuffer[1:] + } + jb.bufferMutex.Unlock() + + // Send to output channel (non-blocking) + select { + case jb.outputChan <- samples: + jb.lastReadTime = time.Now() + default: + // Output channel full - drop this packet + jb.packetsDropped++ + } + } else { + jb.bufferMutex.RUnlock() + // Buffer empty - output silence + silence := make([]float32, 320) // 20ms at 16kHz + select { + case jb.outputChan <- silence: + default: + // Output channel full - just skip + } + } + } +} + +// monitorStats logs periodic statistics +func (jb *WebSocketJitterBuffer) monitorStats() { + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for range ticker.C { + if !jb.IsRunning() { + return + } + jb.logStats() + } +} + +// logStats logs current buffer statistics +func (jb *WebSocketJitterBuffer) logStats() { + jb.bufferMutex.RLock() + bufferSize := len(jb.audioBuffer) + jb.bufferMutex.RUnlock() + + jb.delayMutex.RLock() + currentDelay := jb.currentDelay + jb.delayMutex.RUnlock() + + outputQueueSize := len(jb.outputChan) + + log.Printf("šŸ“Š WebSocket Jitter Buffer Stats: Buffer: %d packets, Delay: %v, Output queue: %d/10, Received: %d, Dropped: %d, Underruns: %d, Overruns: %d", + bufferSize, currentDelay, outputQueueSize, jb.packetsReceived, jb.packetsDropped, jb.underruns, jb.overruns) +} + +// Stop stops the jitter buffer +func (jb *WebSocketJitterBuffer) Stop() error { + jb.runMutex.Lock() + defer jb.runMutex.Unlock() + + if !jb.running { + return nil + } + + jb.running = false + + if jb.ticker != nil { + jb.ticker.Stop() + } + + log.Printf("šŸŽµ WebSocket Jitter Buffer stopped") + return nil +} + +// IsRunning returns true if the jitter buffer is running +func (jb *WebSocketJitterBuffer) IsRunning() bool { + jb.runMutex.RLock() + defer jb.runMutex.RUnlock() + return jb.running +} + +// GetStats returns current jitter buffer statistics +func (jb *WebSocketJitterBuffer) GetStats() map[string]interface{} { + jb.bufferMutex.RLock() + bufferSize := len(jb.audioBuffer) + jb.bufferMutex.RUnlock() + + jb.delayMutex.RLock() + currentDelay := jb.currentDelay + jb.delayMutex.RUnlock() + + return map[string]interface{}{ + "buffer_size": bufferSize, + "current_delay_ms": currentDelay.Milliseconds(), + "target_delay_ms": jb.targetDelay.Milliseconds(), + "packets_received": jb.packetsReceived, + "packets_dropped": 
jb.packetsDropped, + "underruns": jb.underruns, + "overruns": jb.overruns, + "output_queue_size": len(jb.outputChan), + "running": jb.IsRunning(), + } +} diff --git a/sample-assistant.json b/sample-assistant.json new file mode 100644 index 0000000..55e9030 --- /dev/null +++ b/sample-assistant.json @@ -0,0 +1,8 @@ +{ + "assistant_id": "550e8400-e29b-41d4-a716-446655440000", + "name": "Test Assistant", + "first_message": "Hello! I'm your test assistant. How can I help you today?", + "voice_id": "elliot", + "model": "gpt-4o", + "system_message": "You are a helpful and friendly assistant. Keep responses concise and conversational." +} \ No newline at end of file diff --git a/scripts/install.ps1 b/scripts/install.ps1 index 260ee4e..cbbdb6f 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -261,6 +261,12 @@ function Main { Install-Vapi $version $platform Add-ToPath Test-Installation + + Write-Host "" + Write-Info "Note: For voice features (microphone/speaker), PortAudio must be installed." + Write-Host " Windows options:" -ForegroundColor White + Write-Host " - Install via vcpkg: vcpkg install portaudio" -ForegroundColor White + Write-Host " - Or download binaries and ensure portaudio.dll is alongside vapi.exe or on PATH" -ForegroundColor White } # Run main function diff --git a/scripts/install.sh b/scripts/install.sh index d208416..3712e5b 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -170,6 +170,23 @@ tildify() { success "vapi was installed successfully to $Bold_Green$(tildify "$exe")" +# PortAudio notice for voice features +echo +info "Note: For voice features (microphone/speaker), PortAudio must be installed." +case $platform in + 'Darwin x86_64'|'Darwin arm64') + info_bold " brew install portaudio" + ;; + 'Linux x86_64'|'Linux arm64'|'Linux aarch64') + info_bold " Debian/Ubuntu: sudo apt-get update && sudo apt-get install -y libportaudio2 portaudio19-dev" + info_bold " Fedora/RHEL: sudo dnf install -y portaudio portaudio-devel" + info_bold " Arch: sudo pacman -S portaudio" + ;; + 'MINGW64'* ) + info_bold " Windows: Install PortAudio and ensure portaudio.dll is on PATH (e.g., via vcpkg: vcpkg install portaudio)" + ;; +esac + if command -v vapi >/dev/null; then echo "Run 'vapi --help' to get started" exit diff --git a/transient-assistant.json b/transient-assistant.json new file mode 100644 index 0000000..a51642f --- /dev/null +++ b/transient-assistant.json @@ -0,0 +1,7 @@ +{ + "name": "Config-based Assistant", + "first_message": "Hi! I was created from a configuration file. What would you like to talk about?", + "voice_id": "jennifer", + "model": "gpt-4o", + "system_message": "You are an intelligent assistant created from a JSON configuration. Be helpful, creative, and engaging in conversations." +} \ No newline at end of file
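Reviewer note: both assistant config files above deserialize with a struct along these lines (the json tags mirror the file keys; the actual loader in cmd/voice.go may differ). Illustrative only, not part of this diff:

	// config_sketch.go (illustrative only)
	package main

	import (
		"encoding/json"
		"fmt"
		"log"
		"os"
	)

	type assistantConfig struct {
		AssistantID   string `json:"assistant_id,omitempty"`
		Name          string `json:"name"`
		FirstMessage  string `json:"first_message"`
		VoiceID       string `json:"voice_id"`
		Model         string `json:"model"`
		SystemMessage string `json:"system_message"`
	}

	func main() {
		data, err := os.ReadFile("transient-assistant.json")
		if err != nil {
			log.Fatal(err)
		}
		var cfg assistantConfig
		if err := json.Unmarshal(data, &cfg); err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%s: %q\n", cfg.Name, cfg.FirstMessage)
	}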