From fab1ded3dc97415d19919a1769696e970a4ce4c5 Mon Sep 17 00:00:00 2001
From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com>
Date: Tue, 18 Mar 2025 23:57:19 +0000
Subject: [PATCH] Add Voice Command Interface to CompUse

---
 README.md        |  23 ++-
 requirements.txt |   4 +
 voice_cli.py     | 308 ++++++++++++++++++++++++++++++++++++++++
 voice_tools.py   | 358 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 692 insertions(+), 1 deletion(-)
 create mode 100644 voice_cli.py
 create mode 100644 voice_tools.py

diff --git a/README.md b/README.md
index 021f74e..d214e82 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ nice GUI tools for the agent to use. and pull puppetter MCP server to use its to
 
 - Desktop GUI automation with PyAutoGUI
 - Web browser automation with Puppeteer MCP
-- Voice (TODO) /text-based computer control
+- Voice and text-based computer control
 - Screenshot-based interaction (may be need to figure out things like bounding box etc to localize buttons windows)
 - Cross-platform support (macOS, Windows, Linux) -- haven't tested on windows..
 
@@ -43,8 +43,29 @@ npm install -g @modelcontextprotocol/server-puppeteer
    python agent.py
    ```
 
+3. Start the voice-enabled agent (desktop + browser + voice commands):
+   ```bash
+   python voice_cli.py
+   ```
+
+## Voice Command Options
+
+The voice command interface supports several options:
+
+- `--no-voice`: Disable voice commands (use text input only)
+- `--push-to-talk`: Use push-to-talk mode (Ctrl+Space) instead of continuous listening
+- `--no-wake-word`: Disable wake word detection (listen for all speech)
+- `--wake-word WORD`: Set a custom wake word (default: "computer")
+- `-v, --verbose`: Enable verbose logging
+
+Example:
+```bash
+python voice_cli.py --wake-word "assistant" --verbose
+```
+
 ## Requirements
 
 - Python 3.7+
 - Node.js 16+
 - OpenAI API key (set in .env file)
+- For voice commands: microphone and speakers
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 84b8ec2..5754834 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,7 @@ pytest-asyncio
 rich>=13.3.5
 prompt_toolkit
 pyobjc-framework-Cocoa>=8.5; platform_system == "Darwin"
+# Voice command dependencies
+SpeechRecognition>=3.10.0
+pyttsx3>=2.90
+keyboard>=0.13.5
diff --git a/voice_cli.py b/voice_cli.py
new file mode 100644
index 0000000..3e5f5a6
--- /dev/null
+++ b/voice_cli.py
@@ -0,0 +1,308 @@
+from __future__ import annotations
+
+import os
+import asyncio
+import logging
+import argparse
+from typing import Optional, Dict, Any
+import readline
+from pathlib import Path
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import Progress
+from rich.syntax import Syntax
+from rich.prompt import Prompt
+from rich import print as rprint
+from rich.theme import Theme
+from rich.table import Table
+
+from cli import CLI
+from agent_manager import AgentManager
+from voice_tools import AsyncVoiceCommandManager, VoiceCommandConfig
+from dotenv import load_dotenv
+
+# Add this before any other code execution (right after imports)
+parser = argparse.ArgumentParser()
+parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose logging')
+parser.add_argument('--no-voice', action='store_true', help='Disable voice commands')
+parser.add_argument('--push-to-talk', action='store_true', help='Use push-to-talk instead of continuous listening')
+parser.add_argument('--no-wake-word', action='store_true', help='Disable wake word detection')
+parser.add_argument('--wake-word', type=str, default='computer', help='Set custom wake word (default: "computer")')
+args = parser.parse_args()
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO if args.verbose else logging.WARNING,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+
+# Setup Rich console with custom theme (same as in cli.py)
+custom_theme = Theme({
+    "info": "grey70",
+    "warning": "yellow",
+    "error": "red",
+    "success": "grey74",
+    "command": "bold blue",
+    "highlight": "dark_orange3",
+})
+
+console = Console(theme=custom_theme)
+
+class VoiceCLI(CLI):
+    """Extends the CLI with voice command capabilities."""
+
+    def __init__(self):
+        super().__init__()
+        self.console = Console(theme=custom_theme)
+        self.commands.extend(["/voice", "/voice:on", "/voice:off", "/voice:config"])
+        self.voice_manager = None
+        self.voice_enabled = not args.no_voice
+
+        # Configure voice recognition
+        self.voice_config = VoiceCommandConfig(
+            continuous_listening=not args.push_to_talk,
+            use_wake_word=not args.no_wake_word,
+            wake_word=args.wake_word
+        )
+
+    def display_help(self):
+        """Display help information with voice commands."""
+        help_table = Table(title="CompUse CLI Commands")
+        help_table.add_column("Command", style="command")
+        help_table.add_column("Description", style="info")
+
+        help_table.add_row("/help", "Display this help message")
+        help_table.add_row("/tools", "List all available tools")
+        help_table.add_row("/history", "Show conversation history")
+        help_table.add_row("/clear", "Clear the screen")
+        help_table.add_row("/reset", "Reset the conversation history")
+        help_table.add_row("/voice", "Show voice command status")
+        help_table.add_row("/voice:on", "Enable voice commands")
+        help_table.add_row("/voice:off", "Disable voice commands")
+        help_table.add_row("/voice:config", "Show voice command configuration")
+        help_table.add_row("/exit, /quit, /bye", "Exit the application")
+
+        self.console.print(help_table)
+
+    def display_voice_status(self):
+        """Display voice command status."""
+        status_table = Table(title="Voice Command Status")
+        status_table.add_column("Setting", style="command")
+        status_table.add_column("Value", style="info")
+
+        status_table.add_row("Enabled", "✅ Yes" if self.voice_enabled else "❌ No")
+        if self.voice_enabled:
+            status_table.add_row("Listening Mode",
+                                 "Continuous" if self.voice_config.continuous_listening else
+                                 f"Push-to-talk ({self.voice_config.push_to_talk_key})")
+            status_table.add_row("Wake Word",
+                                 f"'{self.voice_config.wake_word}'" if self.voice_config.use_wake_word else "Disabled")
+            status_table.add_row("Language", self.voice_config.language)
+
+        self.console.print(status_table)
+
+    def display_voice_config(self):
+        """Display detailed voice command configuration."""
+        config_table = Table(title="Voice Command Configuration")
+        config_table.add_column("Setting", style="command")
+        config_table.add_column("Value", style="info")
+
+        # Add all config parameters
+        for key, value in self.voice_config.__dict__.items():
+            config_table.add_row(key, str(value))
+
+        self.console.print(config_table)
+
+
+async def run_voice_cli():
+    """Main CLI function with voice command support."""
+    load_dotenv()
+
+    # Initialize CLI
+    cli = VoiceCLI()
+    cli.setup_history()
+
+    # Display welcome banner
+    cli.console.print(Panel.fit(
+        "[grey70]CompUse Voice CLI[/grey70]\n"
+        "[grey70]Desktop & Browser Automation Assistant with Voice Control[/grey70]",
+        border_style="grey70"
+    ))
+
+    agent_manager = None
+    voice_manager = None
+
+    try:
+        # Initialize agent manager
+        with cli.console.status("[dark_orange3]Initializing agent...", spinner="dots"):
+            agent_manager = await AgentManager.initialize()
+
+        # Initialize voice command manager if enabled
+        if cli.voice_enabled:
+            with cli.console.status("[dark_orange3]Initializing voice recognition...", spinner="dots"):
+                voice_manager = AsyncVoiceCommandManager(cli.voice_config)
+                await voice_manager.start()
+                cli.voice_manager = voice_manager
+
+        # Display initial information
+        cli.console.print(agent_manager.get_tools_table())
+        cli.console.print("[success]Combined Agent initialized with both PyAutoGUI and MCP tools![/success]")
+
+        if cli.voice_enabled:
+            cli.console.print("[success]Voice command interface activated![/success]")
+            if cli.voice_config.use_wake_word:
+                cli.console.print(f"[info]Say '[bold]{cli.voice_config.wake_word}[/bold]' followed by your command.[/info]")
+            else:
+                cli.console.print("[info]Voice commands are active. Just speak your command.[/info]")
+
+        cli.console.print("[info]You can also type commands. Type [bold]/help[/bold] for available commands or [bold]/exit[/bold] to quit.[/info]")
+
+        # Main loop
+        while True:
+            try:
+                # Check for voice commands if enabled
+                voice_command = None
+                if cli.voice_enabled and voice_manager:
+                    # Use non-blocking check for voice commands
+                    voice_command = voice_manager.get_next_command_nowait()
+
+                if voice_command:
+                    # Process voice command
+                    cli.console.print(f"[bold cyan]Voice command:[/bold cyan] {voice_command}")
+
+                    # Check if it's a CLI command
+                    if voice_command.lower().startswith(("exit", "quit", "bye")):
+                        cli.console.print("[info]Exit command received. Shutting down...[/info]")
+                        break
+
+                    # Process with agent
+                    with cli.console.status("[dark_orange3] ", spinner="point") as status:
+                        result, elapsed = await agent_manager.run_command(voice_command)
+                        await asyncio.sleep(0.5)
+
+                    # Display result
+                    cli.console.print(f"{result}")
+
+                # Check for text input (non-blocking)
+                await asyncio.sleep(0.1)  # Small delay to prevent CPU hogging
+
+                # Save history before each command
+                try:
+                    readline.write_history_file(cli.history_file)
+                except (PermissionError, OSError):
+                    pass
+
+                # Use a prompt that doesn't block the event loop
+                print()
+                prompt = "\033[1;32m > \033[0m"
+
+                # This is a blocking call, but we need it for text input
+                # In a more sophisticated implementation, we could use asyncio.create_subprocess_exec
+                # to run a separate process for input, but that's beyond the scope of this example
+                user_input = input(prompt)
+
+                if not user_input:
+                    continue
+
+                if user_input.lower() in ['/exit', '/quit', '/bye', 'exit', 'quit', 'bye']:
+                    cli.console.print("[info]Shutting down...[/info]")
+                    break
+
+                elif user_input.lower() == '/help':
+                    cli.display_help()
+                    continue
+
+                elif user_input.lower() == '/clear':
+                    cli.console.clear()
+                    continue
+
+                elif user_input.lower() == '/reset':
+                    agent_manager.reset_history()
+                    cli.console.print("[success]Conversation history has been reset[/success]")
+                    continue
+
+                elif user_input.lower() == '/tools':
+                    cli.console.print(agent_manager.get_tools_table())
+                    continue
+
+                elif user_input.lower() == '/history':
+                    cli.console.print(agent_manager.get_history_table())
+                    continue
+
+                elif user_input.lower() == '/voice':
+                    cli.display_voice_status()
+                    continue
+
+                elif user_input.lower() == '/voice:config':
+                    cli.display_voice_config()
+                    continue
+
+                elif user_input.lower() == '/voice:on':
+                    if not cli.voice_enabled:
+                        cli.voice_enabled = True
+                        if not voice_manager:
+                            voice_manager = AsyncVoiceCommandManager(cli.voice_config)
+                            cli.voice_manager = voice_manager
+                        await voice_manager.start()  # also restarts a manager that /voice:off stopped
+                        cli.console.print("[success]Voice commands enabled[/success]")
+                    else:
+                        cli.console.print("[info]Voice commands are already enabled[/info]")
+                    continue
+
+                elif user_input.lower() == '/voice:off':
+                    if cli.voice_enabled:
+                        cli.voice_enabled = False
+                        if voice_manager:
+                            await voice_manager.stop()
+                        cli.console.print("[success]Voice commands disabled[/success]")
+                    else:
+                        cli.console.print("[info]Voice commands are already disabled[/info]")
+                    continue
+
+                elif user_input.startswith('/'):
+                    cli.console.print(f"[error]Command not found: {user_input}[/error]")
+                    cli.console.print("[info]Type [bold]/help[/bold] to see available commands[/info]")
+                    continue
+
+                # Process regular commands
+                with cli.console.status("[dark_orange3] ", spinner="point") as status:
+                    result, elapsed = await agent_manager.run_command(user_input)
+                    await asyncio.sleep(0.5)
+
+                # Display result
+                cli.console.print(f"{result}")
+
+            except KeyboardInterrupt:
+                break
+            except Exception as e:
+                cli.console.print(f"[error]Error: {str(e)}[/error]")
+
+    finally:
+        # Clean up voice manager
+        if voice_manager:
+            with cli.console.status("[dark_orange3]Stopping voice recognition...", spinner="dots"):
+                await voice_manager.stop()
+
+        # Clean up agent manager
+        if agent_manager:
+            with cli.console.status("[dark_orange3]Cleaning up resources...", spinner="dots"):
+                await agent_manager.cleanup()
+            cli.console.print("[success]Resources cleaned up successfully[/success]")
+
+        # Save history
+        try:
+            readline.write_history_file(cli.history_file)
+        except (PermissionError, OSError, Exception):
+            pass
+
+        cli.console.print("[info]Goodbye![/info]")
+
+
+def main():
+    """Entry point for the voice CLI."""
+    asyncio.run(run_voice_cli())
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/voice_tools.py b/voice_tools.py
new file mode 100644
index 0000000..5247553
--- /dev/null
+++ b/voice_tools.py
@@ -0,0 +1,358 @@
+from __future__ import annotations
+
+import os
+import asyncio
+import threading
+import time
+import logging
+from dataclasses import dataclass
+from typing import Optional, Callable, Dict, Any, List
+import queue
+
+# Import speech recognition library
+try:
+    import speech_recognition as sr
+except ImportError:
+    raise ImportError(
+        "speech_recognition package is required for voice commands. "
+        "Install it with: pip install SpeechRecognition"
+    )
+
+# Import for audio feedback (optional)
+try:
+    import pyttsx3
+except ImportError:
+    pyttsx3 = None
+    logging.warning(
+        "pyttsx3 package not found. Audio feedback will be disabled. "
+        "Install it with: pip install pyttsx3"
+    )
+
+# Import for hotkey detection (optional)
+try:
+    import keyboard
+except ImportError:
+    keyboard = None
+    logging.warning(
+        "keyboard package not found. Push-to-talk hotkey will be disabled. "
+        "Install it with: pip install keyboard"
+    )
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+@dataclass
+class VoiceCommandConfig:
+    """Configuration for voice command recognition."""
+    # General settings
+    enabled: bool = True
+    language: str = "en-US"
+
+    # Listening mode settings
+    continuous_listening: bool = True
+    push_to_talk_key: str = "ctrl+space"  # Only used if continuous_listening is False
+
+    # Wake word settings
+    use_wake_word: bool = True
+    wake_word: str = "computer"
+    wake_word_timeout: int = 5  # seconds to listen after wake word
+
+    # Recognition settings
+    energy_threshold: int = 4000  # Microphone sensitivity
+    pause_threshold: float = 0.8  # Seconds of silence to consider end of phrase
+    dynamic_energy_threshold: bool = True
+
+    # Feedback settings
+    audio_feedback: bool = True
+    visual_feedback: bool = True
+
+    # Advanced settings
+    timeout: int = 5  # Recognition timeout in seconds
+    phrase_time_limit: int = 10  # Max seconds for a single phrase
+
+
+class VoiceCommandListener:
+    """Handles voice command recognition and processing."""
+
+    def __init__(self, config: VoiceCommandConfig, command_callback: Callable[[str], None]):
+        """Initialize the voice command listener.
+
+        Args:
+            config: Configuration for voice recognition
+            command_callback: Function to call when a command is recognized
+        """
+        self.config = config
+        self.command_callback = command_callback
+        self.recognizer = sr.Recognizer()
+        self.microphone = sr.Microphone()
+
+        # Set up recognizer properties
+        self.recognizer.energy_threshold = config.energy_threshold
+        self.recognizer.pause_threshold = config.pause_threshold
+        self.recognizer.dynamic_energy_threshold = config.dynamic_energy_threshold
+
+        # Set up text-to-speech engine for feedback
+        self.engine = None
+        if config.audio_feedback and pyttsx3:
+            self.engine = pyttsx3.init()
+            self.engine.setProperty('rate', 150)
+
+        # Thread control
+        self.running = False
+        self.listening_thread = None
+        self.command_queue = queue.Queue()
+
+        # Calibrate microphone
+        with self.microphone as source:
+            logger.info("Calibrating microphone...")
+            self.recognizer.adjust_for_ambient_noise(source, duration=1)
+            logger.info(f"Microphone calibrated. Energy threshold: {self.recognizer.energy_threshold}")
+
+    def start(self):
+        """Start the voice command listener in a background thread."""
+        if self.running:
+            logger.warning("Voice command listener is already running")
+            return
+
+        self.running = True
+
+        if self.config.continuous_listening:
+            self.listening_thread = threading.Thread(target=self._continuous_listening_loop)
+        else:
+            if not keyboard:
+                logger.error("Push-to-talk requires the keyboard package")
+                self.running = False
+                return
+            self.listening_thread = threading.Thread(target=self._push_to_talk_loop)
+
+        self.listening_thread.daemon = True
+        self.listening_thread.start()
+
+        logger.info("Voice command listener started")
+        if self.config.audio_feedback and self.engine:
+            self.engine.say("Voice commands activated")
+            self.engine.runAndWait()
+
+    def stop(self):
+        """Stop the voice command listener."""
+        self.running = False
+        if self.listening_thread:
+            self.listening_thread.join(timeout=1.0)
+            self.listening_thread = None
+
+        logger.info("Voice command listener stopped")
+        if self.config.audio_feedback and self.engine:
+            self.engine.say("Voice commands deactivated")
+            self.engine.runAndWait()
+
+    def _continuous_listening_loop(self):
+        """Background thread for continuous listening."""
+        while self.running:
+            try:
+                with self.microphone as source:
+                    if self.config.use_wake_word:
+                        # Listen for wake word
+                        logger.debug("Listening for wake word...")
+                        audio = self.recognizer.listen(
+                            source,
+                            timeout=None,
+                            phrase_time_limit=2
+                        )
+
+                        try:
+                            text = self.recognizer.recognize_google(
+                                audio,
+                                language=self.config.language
+                            ).lower()
+
+                            if self.config.wake_word.lower() in text:
+                                logger.info(f"Wake word detected: {text}")
+                                if self.config.audio_feedback and self.engine:
+                                    self.engine.say("Listening")
+                                    self.engine.runAndWait()
+
+                                # Now listen for the actual command
+                                logger.debug("Listening for command...")
+                                command_audio = self.recognizer.listen(
+                                    source,
+                                    timeout=self.config.wake_word_timeout,
+                                    phrase_time_limit=self.config.phrase_time_limit
+                                )
+
+                                self._process_audio(command_audio)
+                        except sr.UnknownValueError:
+                            # Wake word not recognized, continue listening
+                            pass
+                        except sr.RequestError as e:
+                            logger.error(f"Could not request results: {e}")
+                    else:
+                        # Direct listening without wake word
+                        logger.debug("Listening for command...")
+                        audio = self.recognizer.listen(
+                            source,
+                            timeout=None,
+                            phrase_time_limit=self.config.phrase_time_limit
+                        )
+                        self._process_audio(audio)
+            except Exception as e:
+                logger.error(f"Error in continuous listening loop: {e}")
+                time.sleep(1)  # Prevent tight loop on error
+
+    def _push_to_talk_loop(self):
+        """Background thread for push-to-talk listening."""
+        if not keyboard:
+            logger.error("Push-to-talk requires the keyboard package")
+            return
+
+        while self.running:
+            try:
+                # Wait for hotkey press
+                keyboard.wait(self.config.push_to_talk_key)
+
+                if not self.running:
+                    break
+
+                logger.info("Push-to-talk key pressed")
+                if self.config.audio_feedback and self.engine:
+                    self.engine.say("Listening")
+                    self.engine.runAndWait()
+
+                # Listen for command
+                with self.microphone as source:
+                    logger.debug("Listening for command...")
+                    audio = self.recognizer.listen(
+                        source,
+                        timeout=self.config.timeout,
+                        phrase_time_limit=self.config.phrase_time_limit
+                    )
+                    self._process_audio(audio)
+            except Exception as e:
+                logger.error(f"Error in push-to-talk loop: {e}")
+                time.sleep(1)  # Prevent tight loop on error
+
+    def _process_audio(self, audio):
+        """Process audio data and extract command."""
+        try:
+            text = self.recognizer.recognize_google(
+                audio,
+                language=self.config.language
+            )
+
+            logger.info(f"Recognized: {text}")
+
+            if self.config.audio_feedback and self.engine:
+                self.engine.say("Got it")
+                self.engine.runAndWait()
+
+            # Add command to queue for processing
+            self.command_queue.put(text)
+
+            # Call the callback function
+            self.command_callback(text)
+
+        except sr.UnknownValueError:
+            logger.info("Could not understand audio")
+            if self.config.audio_feedback and self.engine:
+                self.engine.say("Sorry, I didn't catch that")
+                self.engine.runAndWait()
+        except sr.RequestError as e:
+            logger.error(f"Could not request results: {e}")
+            if self.config.audio_feedback and self.engine:
+                self.engine.say("Sorry, I couldn't process that")
+                self.engine.runAndWait()
+
+    def get_next_command(self) -> Optional[str]:
+        """Get the next command from the queue, if available."""
+        try:
+            return self.command_queue.get_nowait()
+        except queue.Empty:
+            return None
+
+
+# Async wrapper for voice commands
+class AsyncVoiceCommandManager:
+    """Async wrapper for voice command listener to integrate with CompUse."""
+
+    def __init__(self, config: Optional[VoiceCommandConfig] = None):
+        """Initialize the async voice command manager.
+
+        Args:
+            config: Optional configuration for voice recognition
+        """
+        self.config = config or VoiceCommandConfig()
+        self.command_queue = asyncio.Queue()
+        self.listener = None
+
+    async def start(self):
+        """Start the voice command listener."""
+        if self.listener:
+            logger.warning("Voice command listener is already running")
+            return
+
+        # The callback fires on the listener thread, which has no event loop of
+        # its own, so capture the running loop here and hand each command to it.
+        loop = asyncio.get_running_loop()
+
+        def command_callback(text):
+            asyncio.run_coroutine_threadsafe(self.command_queue.put(text), loop)
+
+        self.listener = VoiceCommandListener(self.config, command_callback)
+        self.listener.start()
+
+    async def stop(self):
+        """Stop the voice command listener."""
+        if self.listener:
+            self.listener.stop()
+            self.listener = None
+
+    async def get_next_command(self) -> str:
+        """Wait for and return the next voice command."""
+        return await self.command_queue.get()
+
+    def get_next_command_nowait(self) -> Optional[str]:
+        """Get the next command without waiting, returns None if no command available."""
+        try:
+            return self.command_queue.get_nowait()
+        except asyncio.QueueEmpty:
+            return None
+
+
+# Example usage
+async def main():
+    """Example usage of voice command manager."""
+    # Create voice command manager with default config
+    manager = AsyncVoiceCommandManager()
+
+    # Start listening
+    await manager.start()
+
+    print("Voice command listener started. Say something!")
+    print("Press Ctrl+C to exit")
+
+    try:
+        while True:
+            # Wait for next command
+            command = await manager.get_next_command()
+            print(f"Command received: {command}")
+
+            # Process command (example)
+            if "exit" in command.lower() or "quit" in command.lower():
+                print("Exit command received, stopping...")
+                break
+    except KeyboardInterrupt:
+        print("Interrupted by user")
+    finally:
+        # Stop listening
+        await manager.stop()
+        print("Voice command listener stopped")
+
+
+if __name__ == "__main__":
+    # Set up logging
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+
+    # Run the example
+    asyncio.run(main())
\ No newline at end of file