From fab1ded3dc97415d19919a1769696e970a4ce4c5 Mon Sep 17 00:00:00 2001
From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com>
Date: Tue, 18 Mar 2025 23:57:19 +0000
Subject: [PATCH] Add Voice Command Interface to CompUse

---
 README.md        |  23 ++-
 requirements.txt |   4 +
 voice_cli.py     | 308 ++++++++++++++++++++++++++++++++++++++++
 voice_tools.py   | 358 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 692 insertions(+), 1 deletion(-)
 create mode 100644 voice_cli.py
 create mode 100644 voice_tools.py

diff --git a/README.md b/README.md
index 021f74e..d214e82 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ nice GUI tools for the agent to use. and pull puppetter MCP server to use its to
 
 - Desktop GUI automation with PyAutoGUI
 - Web browser automation with Puppeteer MCP
-- Voice (TODO) /text-based computer control
+- Voice and text-based computer control
 - Screenshot-based interaction (may be need to figure out things like bounding box etc to localize buttons windows)
 - Cross-platform support (macOS, Windows, Linux) -- haven't tested on windows..
 
@@ -43,8 +43,29 @@ npm install -g @modelcontextprotocol/server-puppeteer
    python agent.py
    ```
 
+3. Start the voice-enabled agent (desktop + browser + voice commands):
+   ```bash
+   python voice_cli.py
+   ```
+
+## Voice Command Options
+
+The voice command interface supports several options:
+
+- `--no-voice`: Disable voice commands (use text input only)
+- `--push-to-talk`: Use push-to-talk mode (Ctrl+Space) instead of continuous listening
+- `--no-wake-word`: Disable wake word detection (listen for all speech)
+- `--wake-word WORD`: Set a custom wake word (default: "computer")
+- `-v, --verbose`: Enable verbose logging
+
+Example:
+```bash
+python voice_cli.py --wake-word "assistant" --verbose
+```
+
 ## Requirements
 
 - Python 3.7+
 - Node.js 16+
 - OpenAI API key (set in .env file)
+- For voice commands: a microphone and speakers, plus PyAudio (needed by SpeechRecognition for microphone input)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 84b8ec2..5754834 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,7 @@ pytest-asyncio
 rich>=13.3.5
 prompt_toolkit
 pyobjc-framework-Cocoa>=8.5; platform_system == "Darwin"
+# Voice command dependencies (microphone input via SpeechRecognition also needs PyAudio installed)
+SpeechRecognition>=3.10.0
+pyttsx3>=2.90
+keyboard>=0.13.5
diff --git a/voice_cli.py b/voice_cli.py
new file mode 100644
index 0000000..3e5f5a6
--- /dev/null
+++ b/voice_cli.py
@@ -0,0 +1,308 @@
+from __future__ import annotations
+
+import os
+import asyncio
+import logging
+import argparse
+from typing import Optional, Dict, Any
+import readline
+from pathlib import Path
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import Progress
+from rich.syntax import Syntax
+from rich.prompt import Prompt
+from rich import print as rprint
+from rich.theme import Theme
+from rich.table import Table
+
+from cli import CLI
+from agent_manager import AgentManager
+from voice_tools import AsyncVoiceCommandManager, VoiceCommandConfig
+from dotenv import load_dotenv
+
+# Parse command-line flags at import time; VoiceCLI.__init__ reads the module-level `args` below.
+parser = argparse.ArgumentParser()
+parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose logging')
+parser.add_argument('--no-voice', action='store_true', help='Disable voice commands')
+parser.add_argument('--push-to-talk', action='store_true', help='Use push-to-talk instead of continuous listening')
+parser.add_argument('--no-wake-word', action='store_true', help='Disable wake word detection')
+parser.add_argument('--wake-word', type=str, default='computer', help='Set custom wake word (default: "computer")')
+args = parser.parse_args()
+
+# Log at INFO when --verbose is given, otherwise only WARNING and above.
+logging.basicConfig(
+    level=logging.INFO if args.verbose else logging.WARNING,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+
+# Rich theme defining the [info]/[warning]/[error]/[success] markup styles used below (said to mirror cli.py)
+custom_theme = Theme({
+    "info": "grey70",
+    "warning": "yellow",
+    "error": "red",
+    "success": "grey74",
+    "command": "bold blue",
+    "highlight": "dark_orange3",
+})
+
+console = Console(theme=custom_theme)
+
+class VoiceCLI(CLI):
+    """CLI subclass that adds the /voice commands and builds the voice config from the parsed flags."""
+    
+    def __init__(self) -> None:
+        super().__init__()
+        self.console = Console(theme=custom_theme)
+        self.commands.extend(["/voice", "/voice:on", "/voice:off", "/voice:config"])  # assumes CLI defines self.commands as a list — TODO confirm
+        self.voice_manager = None  # assigned by run_voice_cli() once a manager has been started
+        self.voice_enabled = not args.no_voice
+        
+        # Translate the command-line flags into the listener configuration
+        self.voice_config = VoiceCommandConfig(
+            continuous_listening=not args.push_to_talk,
+            use_wake_word=not args.no_wake_word,
+            wake_word=args.wake_word
+        )
+    
+    def display_help(self) -> None:
+        """Print the table of slash commands, including the /voice commands."""
+        help_table = Table(title="CompUse CLI Commands")
+        help_table.add_column("Command", style="command")
+        help_table.add_column("Description", style="info")
+        
+        help_table.add_row("/help", "Display this help message")
+        help_table.add_row("/tools", "List all available tools")
+        help_table.add_row("/history", "Show conversation history")
+        help_table.add_row("/clear", "Clear the screen")
+        help_table.add_row("/reset", "Reset the conversation history")
+        help_table.add_row("/voice", "Show voice command status")
+        help_table.add_row("/voice:on", "Enable voice commands")
+        help_table.add_row("/voice:off", "Disable voice commands")
+        help_table.add_row("/voice:config", "Show voice command configuration")
+        help_table.add_row("/exit, /quit, /bye", "Exit the application")
+        
+        self.console.print(help_table)
+    
+    def display_voice_status(self) -> None:
+        """Print whether voice is enabled and, if it is, the listening mode, wake word and language."""
+        status_table = Table(title="Voice Command Status")
+        status_table.add_column("Setting", style="command")
+        status_table.add_column("Value", style="info")
+        
+        status_table.add_row("Enabled", "✅ Yes" if self.voice_enabled else "❌ No")
+        if self.voice_enabled:
+            status_table.add_row("Listening Mode", 
+                                "Continuous" if self.voice_config.continuous_listening else 
+                                f"Push-to-talk ({self.voice_config.push_to_talk_key})")
+            status_table.add_row("Wake Word", 
+                                f"'{self.voice_config.wake_word}'" if self.voice_config.use_wake_word else "Disabled")
+            status_table.add_row("Language", self.voice_config.language)
+        
+        self.console.print(status_table)
+    
+    def display_voice_config(self) -> None:
+        """Print every field of the voice configuration as a setting/value table."""
+        config_table = Table(title="Voice Command Configuration")
+        config_table.add_column("Setting", style="command")
+        config_table.add_column("Value", style="info")
+        
+        # One row per attribute stored on the config instance, value rendered with str()
+        for key, value in self.voice_config.__dict__.items():
+            config_table.add_row(key, str(value))
+        
+        self.console.print(config_table)
+
+
+async def run_voice_cli():
+    """Run the interactive loop: poll for a voice command, then prompt for typed input, until exit."""
+    load_dotenv()
+    
+    # Initialize CLI
+    cli = VoiceCLI()
+    cli.setup_history()
+    
+    # Display welcome banner
+    cli.console.print(Panel.fit(
+        "[grey70]CompUse Voice CLI[/grey70]\n"
+        "[grey70]Desktop & Browser Automation Assistant with Voice Control[/grey70]",
+        border_style="grey70"
+    ))
+    
+    agent_manager = None
+    voice_manager = None
+    
+    try:
+        # Initialize agent manager
+        with cli.console.status("[dark_orange3]Initializing agent...", spinner="dots"):
+            agent_manager = await AgentManager.initialize()
+        
+        # Initialize voice command manager if enabled; on failure fall back to text-only
+        if cli.voice_enabled:
+            try:
+                with cli.console.status("[dark_orange3]Initializing voice recognition...", spinner="dots"):
+                    voice_manager = AsyncVoiceCommandManager(cli.voice_config)
+                    await voice_manager.start()
+                    cli.voice_manager = voice_manager
+            except Exception as e:  # e.g. PyAudio missing or no input device
+                cli.voice_enabled = False  # voice_manager is kept so `finally` can still stop it
+                cli.console.print(f"[warning]Voice recognition unavailable: {e}. Continuing with text input only.[/warning]")
+        
+        # Display initial information
+        cli.console.print(agent_manager.get_tools_table())
+        cli.console.print("[success]Combined Agent initialized with both PyAutoGUI and MCP tools![/success]")
+        
+        if cli.voice_enabled:
+            cli.console.print("[success]Voice command interface activated![/success]")
+            if cli.voice_config.use_wake_word:
+                cli.console.print(f"[info]Say '[bold]{cli.voice_config.wake_word}[/bold]' followed by your command.[/info]")
+            else:
+                cli.console.print("[info]Voice commands are active. Just speak your command.[/info]")
+        
+        cli.console.print("[info]You can also type commands. Type [bold]/help[/bold] for available commands or [bold]/exit[/bold] to quit.[/info]")
+        
+        # Main loop
+        while True:
+            try:
+                # Check for voice commands if enabled
+                voice_command = None
+                if cli.voice_enabled and voice_manager:
+                    # Use non-blocking check for voice commands
+                    voice_command = voice_manager.get_next_command_nowait()
+                
+                if voice_command:
+                    # Process voice command
+                    cli.console.print(f"[bold cyan]Voice command:[/bold cyan] {voice_command}")
+                    
+                    # Spoken phrases that start with exit/quit/bye end the session
+                    if voice_command.lower().startswith(("exit", "quit", "bye")):
+                        cli.console.print("[info]Exit command received. Shutting down...[/info]")
+                        break
+                    
+                    # Process with agent
+                    with cli.console.status("[dark_orange3] ", spinner="point"):
+                        result, _elapsed = await agent_manager.run_command(voice_command)
+                        await asyncio.sleep(0.5)
+                    
+                    # Display result
+                    cli.console.print(f"{result}")
+                    
+                await asyncio.sleep(0.1)  # Small delay; also yields to the event loop so queued voice callbacks can run
+                
+                # Save history before each command
+                try:
+                    readline.write_history_file(cli.history_file)
+                except (PermissionError, OSError):
+                    pass
+                
+                print()
+                prompt = "\033[1;32m > \033[0m"
+                # NOTE: input() blocks the event loop, so a voice command recognized while the
+                # prompt is waiting is only picked up on a later loop iteration.
+                user_input = input(prompt)
+                
+                if not user_input:
+                    continue
+                
+                if user_input.lower() in ['/exit', '/quit', '/bye', 'exit', 'quit', 'bye']:
+                    cli.console.print("[info]Shutting down...[/info]")
+                    break
+                
+                elif user_input.lower() == '/help':
+                    cli.display_help()
+                    continue
+                
+                elif user_input.lower() == '/clear':
+                    cli.console.clear()
+                    continue
+                
+                elif user_input.lower() == '/reset':
+                    agent_manager.reset_history()
+                    cli.console.print("[success]Conversation history has been reset[/success]")
+                    continue
+                
+                elif user_input.lower() == '/tools':
+                    cli.console.print(agent_manager.get_tools_table())
+                    continue
+                
+                elif user_input.lower() == '/history':
+                    cli.console.print(agent_manager.get_history_table())
+                    continue
+                
+                elif user_input.lower() == '/voice':
+                    cli.display_voice_status()
+                    continue
+                
+                elif user_input.lower() == '/voice:config':
+                    cli.display_voice_config()
+                    continue
+                
+                elif user_input.lower() == '/voice:on':
+                    if not cli.voice_enabled:
+                        if not voice_manager:
+                            voice_manager = AsyncVoiceCommandManager(cli.voice_config)
+                        await voice_manager.start()
+                        cli.voice_manager = voice_manager
+                        cli.voice_enabled = True  # Flip the flag only after start() succeeded
+                        cli.console.print("[success]Voice commands enabled[/success]")
+                    else:
+                        cli.console.print("[info]Voice commands are already enabled[/info]")
+                    continue
+                
+                elif user_input.lower() == '/voice:off':
+                    if cli.voice_enabled:
+                        cli.voice_enabled = False
+                        if voice_manager:
+                            await voice_manager.stop()
+                        cli.console.print("[success]Voice commands disabled[/success]")
+                    else:
+                        cli.console.print("[info]Voice commands are already disabled[/info]")
+                    continue
+                
+                elif user_input.startswith('/'):
+                    cli.console.print(f"[error]Command not found: {user_input}[/error]")
+                    cli.console.print("[info]Type [bold]/help[/bold] to see available commands[/info]")
+                    continue
+                
+                # Process regular commands
+                with cli.console.status("[dark_orange3] ", spinner="point"):
+                    result, _elapsed = await agent_manager.run_command(user_input)
+                    await asyncio.sleep(0.5)
+                
+                # Display result
+                cli.console.print(f"{result}")
+                
+            except KeyboardInterrupt:
+                break
+            except Exception as e:
+                cli.console.print(f"[error]Error: {str(e)}[/error]")
+    
+    finally:
+        # Clean up voice manager
+        if voice_manager:
+            with cli.console.status("[dark_orange3]Stopping voice recognition...", spinner="dots"):
+                await voice_manager.stop()
+        
+        # Clean up agent manager
+        if agent_manager:
+            with cli.console.status("[dark_orange3]Cleaning up resources...", spinner="dots"):
+                await agent_manager.cleanup()
+                cli.console.print("[success]Resources cleaned up successfully[/success]")
+        
+        # Save history (best-effort: a failed write must not mask shutdown)
+        try:
+            readline.write_history_file(cli.history_file)
+        except Exception:
+            pass
+        
+        cli.console.print("[info]Goodbye![/info]")
+
+
+def main() -> None:
+    """Synchronous entry point: run run_voice_cli() to completion with asyncio.run()."""
+    asyncio.run(run_voice_cli())
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/voice_tools.py b/voice_tools.py
new file mode 100644
index 0000000..5247553
--- /dev/null
+++ b/voice_tools.py
@@ -0,0 +1,358 @@
+from __future__ import annotations
+
+import os
+import asyncio
+import threading
+import time
+import logging
+from dataclasses import dataclass
+from typing import Optional, Callable, Dict, Any, List
+import queue
+
+# Import speech recognition library
+try:
+    import speech_recognition as sr
+except ImportError:
+    raise ImportError(
+        "speech_recognition package is required for voice commands. "
+        "Install it with: pip install SpeechRecognition"
+    )
+
+# Import for audio feedback (optional)
+try:
+    import pyttsx3
+except ImportError:
+    pyttsx3 = None
+    logging.warning(
+        "pyttsx3 package not found. Audio feedback will be disabled. "
+        "Install it with: pip install pyttsx3"
+    )
+
+# Import for hotkey detection (optional)
+try:
+    import keyboard
+except ImportError:
+    keyboard = None
+    logging.warning(
+        "keyboard package not found. Push-to-talk hotkey will be disabled. "
+        "Install it with: pip install keyboard"
+    )
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+@dataclass
+class VoiceCommandConfig:
+    """Settings consumed by VoiceCommandListener; every field has a default."""
+    # General settings
+    enabled: bool = True  # Not read by the listener classes in this module
+    language: str = "en-US"  # Passed as `language` to recognize_google()
+    
+    # Listening mode settings
+    continuous_listening: bool = True
+    push_to_talk_key: str = "ctrl+space"  # Only used if continuous_listening is False
+    
+    # Wake word settings
+    use_wake_word: bool = True
+    wake_word: str = "computer"  # Matched case-insensitively as a substring of the recognized text
+    wake_word_timeout: int = 5  # listen() timeout (seconds) for the command that follows the wake word
+    
+    # Recognition settings
+    energy_threshold: int = 4000  # Initial recognizer value; the listener then runs adjust_for_ambient_noise()
+    pause_threshold: float = 0.8  # Seconds of silence to consider end of phrase
+    dynamic_energy_threshold: bool = True
+    
+    # Feedback settings
+    audio_feedback: bool = True  # Spoken prompts via pyttsx3, only when that package imported successfully
+    visual_feedback: bool = True  # Not read by the listener classes in this module
+    
+    # Advanced settings
+    timeout: int = 5  # listen() timeout in seconds; used by the push-to-talk loop
+    phrase_time_limit: int = 10  # Max seconds for a single command phrase (wake-word detection uses a fixed 2)
+
+
+class VoiceCommandListener:
+    """Recognize speech on a background thread and deliver each phrase via a queue and a callback."""
+    
+    def __init__(self, config: VoiceCommandConfig, command_callback: Callable[[str], None]):
+        """Set up recognizer, microphone and optional text-to-speech, then calibrate for ambient noise.
+        
+        Args:
+            config: Configuration for voice recognition
+            command_callback: Called with the recognized text, on the listener thread
+        """
+        self.config = config
+        self.command_callback = command_callback
+        self.recognizer = sr.Recognizer()
+        self.microphone = sr.Microphone()
+        
+        # Set up recognizer properties
+        self.recognizer.energy_threshold = config.energy_threshold
+        self.recognizer.pause_threshold = config.pause_threshold
+        self.recognizer.dynamic_energy_threshold = config.dynamic_energy_threshold
+        
+        # Set up text-to-speech engine for feedback (stays None when pyttsx3 is unavailable)
+        self.engine = None
+        if config.audio_feedback and pyttsx3:
+            self.engine = pyttsx3.init()
+            self.engine.setProperty('rate', 150)
+            
+        # Thread control
+        self.running = False
+        self.listening_thread = None
+        self.command_queue = queue.Queue()
+        
+        # Calibrate microphone (samples ambient noise for one second)
+        with self.microphone as source:
+            logger.info("Calibrating microphone...")
+            self.recognizer.adjust_for_ambient_noise(source, duration=1)
+            logger.info(f"Microphone calibrated. Energy threshold: {self.recognizer.energy_threshold}")
+    
+    def start(self):
+        """Start the listening loop for the configured mode in a daemon thread."""
+        if self.running:
+            logger.warning("Voice command listener is already running")
+            return
+            
+        self.running = True
+        
+        if self.config.continuous_listening:
+            self.listening_thread = threading.Thread(target=self._continuous_listening_loop)
+        else:
+            if not keyboard:
+                logger.error("Push-to-talk requires the keyboard package")
+                self.running = False
+                return
+            self.listening_thread = threading.Thread(target=self._push_to_talk_loop)
+            
+        self.listening_thread.daemon = True
+        self.listening_thread.start()
+        
+        logger.info("Voice command listener started")
+        if self.config.audio_feedback and self.engine:
+            self.engine.say("Voice commands activated")
+            self.engine.runAndWait()
+    
+    def stop(self):
+        """Clear the running flag and wait up to one second for the listening thread to exit."""
+        self.running = False
+        if self.listening_thread:
+            self.listening_thread.join(timeout=1.0)
+            self.listening_thread = None
+            
+        logger.info("Voice command listener stopped")
+        if self.config.audio_feedback and self.engine:
+            self.engine.say("Voice commands deactivated")
+            self.engine.runAndWait()
+    
+    def _continuous_listening_loop(self):
+        """Thread body for continuous mode: optionally wait for the wake word, then take a command."""
+        while self.running:
+            try:
+                with self.microphone as source:
+                    if self.config.use_wake_word:
+                        logger.debug("Listening for wake word...")
+                        audio = self.recognizer.listen(
+                            source, 
+                            timeout=None,
+                            phrase_time_limit=2
+                        )
+                        
+                        try:
+                            text = self.recognizer.recognize_google(
+                                audio, 
+                                language=self.config.language
+                            ).lower()
+                            
+                            if self.config.wake_word.lower() in text:
+                                logger.info(f"Wake word detected: {text}")
+                                if self.config.audio_feedback and self.engine:
+                                    self.engine.say("Listening")
+                                    self.engine.runAndWait()
+                                
+                                # Now listen for the actual command
+                                logger.debug("Listening for command...")
+                                command_audio = self.recognizer.listen(
+                                    source, 
+                                    timeout=self.config.wake_word_timeout,
+                                    phrase_time_limit=self.config.phrase_time_limit
+                                )
+                                
+                                self._process_audio(command_audio)
+                        except sr.WaitTimeoutError:  # expected when no command follows the wake word
+                            logger.debug("No command heard within %s s of the wake word", self.config.wake_word_timeout)
+                        except sr.UnknownValueError:
+                            pass  # Wake word not recognized, continue listening
+                        except sr.RequestError as e:
+                            logger.error(f"Could not request results: {e}")
+                    else:
+                        # Direct listening without wake word
+                        logger.debug("Listening for command...")
+                        audio = self.recognizer.listen(
+                            source, 
+                            timeout=None,
+                            phrase_time_limit=self.config.phrase_time_limit
+                        )
+                        self._process_audio(audio)
+            except Exception as e:
+                logger.error(f"Error in continuous listening loop: {e}")
+                time.sleep(1)  # Prevent tight loop on error
+    
+    def _push_to_talk_loop(self):
+        """Thread body for push-to-talk mode: wait for the hotkey, then record one command."""
+        if not keyboard:
+            logger.error("Push-to-talk requires the keyboard package")
+            return
+            
+        while self.running:
+            try:
+                keyboard.wait(self.config.push_to_talk_key)  # Blocks until the hotkey is pressed
+                
+                if not self.running:
+                    break
+                    
+                logger.info("Push-to-talk key pressed")
+                if self.config.audio_feedback and self.engine:
+                    self.engine.say("Listening")
+                    self.engine.runAndWait()
+                
+                with self.microphone as source:
+                    logger.debug("Listening for command...")
+                    audio = self.recognizer.listen(
+                        source, 
+                        timeout=self.config.timeout,
+                        phrase_time_limit=self.config.phrase_time_limit
+                    )
+                    self._process_audio(audio)
+            except sr.WaitTimeoutError:  # key pressed but nothing said within config.timeout
+                logger.info("No speech detected after push-to-talk key press")
+            except Exception as e:
+                logger.error(f"Error in push-to-talk loop: {e}")
+                time.sleep(1)  # Prevent tight loop on error
+    
+    def _process_audio(self, audio):
+        """Transcribe `audio` and publish the text; recognition failures are logged, not raised."""
+        try:
+            text = self.recognizer.recognize_google(
+                audio, 
+                language=self.config.language
+            )
+            
+            logger.info(f"Recognized: {text}")
+            
+            if self.config.audio_feedback and self.engine:
+                self.engine.say("Got it")
+                self.engine.runAndWait()
+                
+            # Publish on the thread-safe queue that get_next_command() drains
+            self.command_queue.put(text)
+            
+            # Also hand the text to the callback, on this (listener) thread
+            self.command_callback(text)
+            
+        except sr.UnknownValueError:
+            logger.info("Could not understand audio")
+            if self.config.audio_feedback and self.engine:
+                self.engine.say("Sorry, I didn't catch that")
+                self.engine.runAndWait()
+        except sr.RequestError as e:
+            logger.error(f"Could not request results: {e}")
+            if self.config.audio_feedback and self.engine:
+                self.engine.say("Sorry, I couldn't process that")
+                self.engine.runAndWait()
+    
+    def get_next_command(self) -> Optional[str]:
+        """Return the oldest queued command without blocking, or None when the queue is empty."""
+        try:
+            return self.command_queue.get_nowait()
+        except queue.Empty:
+            return None
+
+
+# Async wrapper for voice commands
+class AsyncVoiceCommandManager:
+    """Bridge a threaded VoiceCommandListener to asyncio by forwarding commands into an asyncio.Queue."""
+    
+    def __init__(self, config: Optional[VoiceCommandConfig] = None):
+        """Initialize the async voice command manager.
+        
+        Args:
+            config: Optional configuration for voice recognition; defaults to VoiceCommandConfig()
+        """
+        self.config = config or VoiceCommandConfig()
+        self.command_queue = asyncio.Queue()
+        self.listener = None
+    
+    async def start(self):
+        """Create and start the listener; logs a warning and returns if one already exists."""
+        if self.listener:
+            logger.warning("Voice command listener is already running")
+            return
+            
+        # Capture the running loop here, on the loop's own thread. The callback runs on the
+        # listener thread, where asyncio.get_event_loop() would not find this loop.
+        loop = asyncio.get_running_loop()
+
+        def command_callback(text: str) -> None:
+            loop.call_soon_threadsafe(self.command_queue.put_nowait, text)
+            
+        self.listener = VoiceCommandListener(self.config, command_callback)
+        self.listener.start()
+    
+    async def stop(self):
+        """Stop and discard the listener, if any; a later start() creates a fresh one."""
+        if self.listener:
+            self.listener.stop()
+            self.listener = None
+    
+    async def get_next_command(self) -> str:
+        """Wait for and return the next voice command."""
+        return await self.command_queue.get()
+    
+    def get_next_command_nowait(self) -> Optional[str]:
+        """Get the next command without waiting, returns None if no command available."""
+        try:
+            return self.command_queue.get_nowait()
+        except asyncio.QueueEmpty:
+            return None
+
+
+# Example usage
+async def main() -> None:
+    """Demo: start a default manager and print each recognized command until one contains exit/quit."""
+    # Create voice command manager with default config
+    manager = AsyncVoiceCommandManager()
+    
+    # Start listening
+    await manager.start()
+    
+    print("Voice command listener started. Say something!")
+    print("Press Ctrl+C to exit")
+    
+    try:
+        while True:
+            # Suspend until the listener delivers the next command
+            command = await manager.get_next_command()
+            print(f"Command received: {command}")
+            
+            # Substring match: any phrase containing "exit" or "quit" ends the demo
+            if "exit" in command.lower() or "quit" in command.lower():
+                print("Exit command received, stopping...")
+                break
+    except KeyboardInterrupt:
+        print("Interrupted by user")
+    finally:
+        # Always stop the listener, whichever way the loop ended
+        await manager.stop()
+        print("Voice command listener stopped")
+
+
+if __name__ == "__main__":
+    # Set up INFO-level logging for the demo run
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    
+    # Run the example
+    asyncio.run(main())
\ No newline at end of file