From f6a48e2bc568820774cffd2647ed41143f991e6d Mon Sep 17 00:00:00 2001
From: sementerleen <sementerleen@users.noreply.github.com>
Date: Fri, 10 Apr 2026 23:31:19 +0300
Subject: [PATCH] Add Windows port of Clicky

---
 windows/.env.example     |   1 +
 windows/.gitignore       |   6 +
 windows/main.py          | 531 +++++++++++++++++++++++++++++++++++++++
 windows/requirements.txt |  10 +
 4 files changed, 548 insertions(+)
 create mode 100644 windows/.env.example
 create mode 100644 windows/.gitignore
 create mode 100644 windows/main.py
 create mode 100644 windows/requirements.txt
diff --git a/windows/.env.example b/windows/.env.example
new file mode 100644
index 00000000..7437ef33
--- /dev/null
+++ b/windows/.env.example
@@ -0,0 +1 @@
+ANTHROPIC_API_KEY=your_api_key_here
diff --git a/windows/.gitignore b/windows/.gitignore
new file mode 100644
index 00000000..7d7cbd10
--- /dev/null
+++ b/windows/.gitignore
@@ -0,0 +1,6 @@
+.env
+__pycache__/
+*.pyc
+*.pyo
+*.tmp
+*.mp3
diff --git a/windows/main.py b/windows/main.py
new file mode 100644
index 00000000..e36bff3c
--- /dev/null
+++ b/windows/main.py
@@ -0,0 +1,531 @@
+import os
+import sys
+import asyncio
+import threading
+import tempfile
+import base64
+import io
+import re
+import time
+import random
+import tkinter as tk
+import subprocess
+
+import anthropic
+import edge_tts
+from faster_whisper import WhisperModel
+import sounddevice as sd
+import numpy as np
+from scipy.io.wavfile import write as wav_write
+import mss
+from PIL import Image
+import pyautogui
+from dotenv import load_dotenv
+
+load_dotenv()  # pull ANTHROPIC_API_KEY (and any other settings) from a local .env file
+
+pyautogui.FAILSAFE = True  # pyautogui fail-safe: mouse in a screen corner raises FailSafeException
+pyautogui.PAUSE = 0.0      # no implicit pause after pyautogui calls; run_action() sleeps explicitly
+
+ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")  # None when unset; main() exits in that case
+SAMPLE_RATE = 16000  # Hz; used for microphone capture and for the temporary WAV file
+MAX_AGENT_STEPS = 55  # upper bound on iterations of the agent's step loop
+TTS_VOICE = "en-US-AriaNeural"
+FILLERS = ["", "", "", "So, ", "Alright, ", "Okay, ", "Now, ", "Right, "]  # random prefix in speak()
+
+client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
+whisper_model = None  # created lazily by load_whisper()
+
+# State machine:
+#   idle       — nothing running
+#   recording  — recording initial request
+#   running    — agent executing steps
+#   listening  — agent paused, recording new instruction
+STATE = "idle"
+
+stop_agent = False  # set by on_stop(); polled by the agent thread and by speak()
+new_instruction = None          # set when user speaks mid-task
+new_instruction_ready = threading.Event()  # signalled by _listen_thread() and by on_stop()
+
+audio_chunks = []  # NOTE(review): not referenced anywhere else in this file
+stop_audio_flag = threading.Event()  # set to end the recording in record_until_stopped()
+_tts_proc = None  # ffplay process started by the most recent speak() call, if any
+
+root_window = None  # the Tk root; assigned in main()
+
+# ── Prompts ────────────────────────────────────────────────────
+PLAN_PROMPT = """You are Clicky, an AI screen tutor. English only.
+
+Look at the screenshot and the user's request. Write a short numbered plan — max 5 steps, plain English. This will be read aloud.
+
+Format:
+PLAN:
+1. ...
+2. ...
+"""  # system prompt for the planning calls; the reply is spoken after "PLAN:" is stripped
+
+STEP_PROMPT = """You are Clicky, an AI screen tutor. English only.
+
+You receive a fresh screenshot before every action. Decide the NEXT single action.
+
+RESPONSE FORMAT (always exactly this):
+EXPLAIN: <one natural sentence — what you see and what you will do>
+ACTION: [COMMAND]
+
+COMMANDS (coordinates from 1280x720 screenshot):
+- [CLICK x=N y=N]
+- [RIGHTCLICK x=N y=N]
+- [DBLCLICK x=N y=N]
+- [MOVE x=N y=N]
+- [TYPE text="hello"]
+- [KEY key="enter"]
+- [WAIT ms=800]
+- [SHOWDESKTOP]
+- [DONE]
+
+RULES:
+1. ONE action only.
+2. GUI + MOUSE only. No terminal, no PowerShell.
+3. Do not close or minimize windows unnecessarily.
+4. After RIGHTCLICK: next action must MOVE to a menu item.
+5. [DONE] when fully complete."""  # system prompt for every step; parsed by parse_response()
+
+RECONFIG_PROMPT = """You are Clicky, an AI screen tutor. English only.
+
+The user interrupted with a new instruction while you were working. Look at the current screenshot and the new instruction. Decide the NEXT single action — continuing from the current screen state, adjusting to what the user asked.
+
+New instruction: {instruction}
+
+RESPONSE FORMAT:
+EXPLAIN: <one sentence: what you will do next based on new instruction>
+ACTION: [COMMAND]
+
+Same COMMANDS as before. [DONE] if finished."""  # NOTE(review): not referenced elsewhere in this file
+
+
+# ── TTS ────────────────────────────────────────────────────────
+def speak(text: str):
+    """Speak *text*: edge-tts renders a temp MP3, ffplay plays it; blocks until playback ends or is killed."""
+    global _tts_proc
+    if stop_agent or not text.strip(): return
+    full = random.choice(FILLERS) + text.replace("'", " ").replace('"', " ")
+    tmp = None
+    try:
+        fd, tmp = tempfile.mkstemp(suffix=".mp3"); os.close(fd)  # atomic create (mktemp is race-prone); only the path is needed
+        asyncio.run(edge_tts.Communicate(full, voice=TTS_VOICE).save(tmp))
+        _tts_proc = subprocess.Popen(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", tmp],
+                                     stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        _tts_proc.wait()
+    except Exception as e:
+        log(f"[TTS error] {e}")
+    finally:  # remove the MP3 on every path, including synthesis/ffplay failures
+        if tmp:
+            try: os.unlink(tmp)
+            except OSError: pass  # best-effort cleanup
+
+
+def stop_tts():  # cut speech short: kill the ffplay process started by speak(), if still running
+    player = _tts_proc
+    if player is not None and player.poll() is None:
+        player.kill()
+
+
+# ── Whisper ─────────────────────────────────────────────────────
+def load_whisper():  # create the shared Whisper model on first use; a no-op once it exists
+    global whisper_model
+    if whisper_model is not None: return
+    log("Loading Whisper...")
+    whisper_model = WhisperModel("base", device="cpu", compute_type="int8")
+    log("Whisper ready!")
+
+
+def record_until_stopped() -> np.ndarray:
+    """Record mono int16 audio until stop_audio_flag is set or 120 s elapse; return the samples."""
+    captured = []
+    stop_audio_flag.clear()
+    def on_audio(indata, frames, t, status):
+        captured.append(indata.copy())  # keep a copy of every block the stream delivers
+    stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16", callback=on_audio)
+    with stream:
+        stop_audio_flag.wait(timeout=120)
+    return np.concatenate(captured, axis=0) if captured else np.zeros((0,1), dtype="int16")
+
+
+def transcribe(audio: np.ndarray) -> str:
+    """Return the English transcript of *audio* via a temporary WAV file ("" when there are no samples)."""
+    if audio.shape[0] == 0: return ""  # checked first so an empty recording never triggers a model load
+    load_whisper()
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: path = f.name
+    try:
+        wav_write(path, SAMPLE_RATE, audio)
+        segs, _ = whisper_model.transcribe(path, language="en")
+        return " ".join(s.text for s in segs).strip()  # consume segs before cleanup (may be lazy)
+    finally: os.unlink(path)  # also runs on errors, so a failed transcription leaves no WAV behind
+
+
+# ── Screen ──────────────────────────────────────────────────────
+def capture_screen():
+    """Screenshot monitor 1 and return (base64 PNG, native (w, h), downscaled (w, h))."""
+    with mss.mss() as sct:
+        shot = sct.grab(sct.monitors[1])
+        native_size = (shot.width, shot.height)
+        img = Image.frombytes("RGB", shot.size, shot.bgra, "raw", "BGRX")
+        # Shrink in place to fit within 1280x720; the resulting size is returned so that
+        # callers can map model coordinates back to native pixels with scale().
+        img.thumbnail((1280, 720))
+        png = io.BytesIO()
+        img.save(png, format="PNG")
+        return base64.standard_b64encode(png.getvalue()).decode(), native_size, img.size
+
+
+def scale(x, y, orig, scaled):  # map a point from screenshot pixels to native screen pixels
+    return tuple(int(v * full / small) for v, full, small in zip((x, y), orig, scaled))
+
+
+# ── Mouse ───────────────────────────────────────────────────────
+def run_action(cmd, args, orig, scaled):
+    """Perform one parsed command via pyautogui; errors are logged, but the fail-safe abort is re-raised."""
+    try:
+        if cmd == "SHOWDESKTOP":
+            log("[Show desktop]")
+            pyautogui.hotkey("win", "d")
+            time.sleep(1.2)
+        elif cmd in ("CLICK", "RIGHTCLICK", "DBLCLICK"):
+            px, py = scale(int(args["x"]), int(args["y"]), orig, scaled)
+            pyautogui.moveTo(px, py, duration=0.55, tween=pyautogui.easeInOutQuad)
+            time.sleep(0.15)
+            label, press_button, settle = {  # log label, click function, pause afterwards (s)
+                "CLICK": ("Click", pyautogui.click, 0.2),
+                "RIGHTCLICK": ("Right-click", pyautogui.rightClick, 1.2),
+                "DBLCLICK": ("Double-click", pyautogui.doubleClick, 0.3),
+            }[cmd]
+            log(f"[{label}] ({px},{py})")
+            press_button(); time.sleep(settle)
+        elif cmd == "MOVE":
+            px, py = scale(int(args["x"]), int(args["y"]), orig, scaled)
+            log(f"[Move] ({px},{py})")
+            pyautogui.moveTo(px, py, duration=0.45, tween=pyautogui.easeInOutQuad)
+        elif cmd == "TYPE":
+            log(f"[Type] {args.get('text','')}")
+            pyautogui.write(args.get("text",""), interval=0.05)
+        elif cmd == "KEY":
+            combo = args.get("key","")  # "ctrl+c" style combos are sent as a hotkey
+            log(f"[Key] {combo}")
+            pyautogui.hotkey(*combo.split("+")) if "+" in combo else pyautogui.press(combo)
+        elif cmd == "WAIT":
+            delay_ms = max(int(args.get("ms",500)), 200)  # never wait less than 200 ms
+            log(f"[Wait] {delay_ms}ms"); time.sleep(delay_ms/1000)
+    except pyautogui.FailSafeException: raise  # user's emergency stop: must not be swallowed below
+    except Exception as e:
+        log(f"[Error] {cmd}: {e}")
+
+
+# ── Claude ──────────────────────────────────────────────────────
+def call_claude(messages, system, max_tokens=200):
+    """Send *messages* under the *system* prompt and return the text of the first content block."""
+    reply = client.messages.create(
+        model="claude-opus-4-6", system=system, messages=messages, max_tokens=max_tokens,
+    )
+    return reply.content[0].text
+
+
+def parse_response(resp):
+    """Parse an 'EXPLAIN: ... / ACTION: [CMD k=v ...]' reply into (explain, cmd_name, args, is_done)."""
+    explain, cmd_name, args = "", None, {}
+    m = re.search(r'EXPLAIN:\s*(.+)', resp)
+    if m: explain = m.group(1).strip()
+    a = re.search(r'ACTION:\s*\[(\w+)\s*([^\]]*)\]', resp)
+    if a:
+        cmd_name = a.group(1)
+        for kv in re.finditer(r'(\w+)=(?:"([^"]*)"|([\d]+))', a.group(2)):
+            # group(2) is "" (not None) for an empty quoted value such as text="" — keep it as ""
+            args[kv.group(1)] = kv.group(2) if kv.group(2) is not None else kv.group(3)
+    return explain, cmd_name, args, cmd_name == "DONE"
+
+
+# ── Button handler ──────────────────────────────────────────────
+def on_record_btn():
+    """Listen-button handler: run the transition that belongs to the current STATE."""
+    transitions = {
+        "idle": _start_recording,
+        "recording": _stop_recording,
+        "running": _pause_and_listen,
+        "listening": _finish_listening,
+    }
+    if STATE in transitions:
+        transitions[STATE]()
+
+
+def _set_state(s):
+    """Store *s* as the global STATE and restyle both buttons to match it."""
+    global STATE
+    STATE = s
+    palette = {"idle": "#3B82F6", "recording": "#F59E0B", "running": "#10B981", "listening": "#8B5CF6"}
+    btn_record.config(bg=palette[s], text="Listen")
+    btn_stop.config(state="normal" if s in ("running","listening") else "disabled")
+
+
+def _start_recording():  # idle -> recording: capture the initial request on a daemon thread
+    _set_state("recording")
+    threading.Thread(target=_recording_thread, daemon=True).start()
+
+
+def _stop_recording():
+    stop_audio_flag.set()
+    # record_until_stopped() is blocked on this flag; setting it lets _recording_thread() continue
+
+
+def _pause_and_listen():  # running -> listening: silence speech and record a mid-task instruction
+    global new_instruction
+    stop_tts()                      # kill speech immediately
+    new_instruction = None
+    new_instruction_ready.clear()   # set again by _listen_thread() once the transcript is stored
+    _set_state("listening")
+    log("Listening...")
+    threading.Thread(target=_listen_thread, daemon=True).start()
+
+
+def _finish_listening():  # end the mid-task recording that _pause_and_listen() started
+    stop_audio_flag.set()
+
+
+def on_stop():  # Stop-button handler: abort the agent and release anything that may be blocked
+    global stop_agent
+    stop_agent = True
+    stop_tts()
+    stop_audio_flag.set()         # end any recording in progress
+    new_instruction_ready.set()   # wake wait_for_listen_and_reconfig() if it is waiting
+    log("Stopped.")
+
+
+# ── Threads ──────────────────────────────────────────────────────
+def _recording_thread():  # worker: record the initial request, then hand the audio to the agent
+    audio = record_until_stopped()
+    _set_state("running")
+    threading.Thread(target=_agent_thread, args=(audio,), daemon=True).start()
+
+
+def _listen_thread():
+    """Record and transcribe a mid-task instruction, then wake the agent loop (even on failure)."""
+    global new_instruction
+    try:
+        new_instruction = transcribe(record_until_stopped())
+        log(f"New instruction: {new_instruction}")
+    finally:  # always release the agent, which waits on this event while STATE is "listening"
+        new_instruction_ready.set(); _set_state("running")
+
+
+# ── Agent ────────────────────────────────────────────────────────
+def interrupted():
+    """True if the user pressed Listen mid-task (STATE is "listening") or pressed Stop."""
+    return STATE == "listening" or stop_agent
+
+
+def wait_for_listen_and_reconfig(messages, task):  # NOTE: *messages* is accepted but never used here
+    """Wait (up to 60 s) for the spoken instruction, confirm it aloud, speak a fresh plan and return it as the new task; return *task* unchanged if stopped or nothing was heard."""
+    global new_instruction
+    new_instruction_ready.wait(timeout=60)  # set by _listen_thread() or on_stop()
+    new_instruction_ready.clear()
+    if stop_agent:
+        return task
+    if not new_instruction:  # timed out, or the transcript came back empty
+        speak("I didn't catch that. Let me continue with what I was doing.")
+        return task
+
+    updated_task = new_instruction
+    new_instruction = None
+    log(f"New instruction: {updated_task}")
+
+    # Speak back confirmation
+    speak(f"Got it. You said: {updated_task}")
+    if stop_agent: return updated_task
+
+    # Take fresh screenshot and make a new plan
+    log("Replanning...")
+    b64, orig, scaled = capture_screen()  # only the image is used; both sizes are ignored here
+    if stop_agent: return updated_task
+
+    plan_resp = call_claude(
+        [{"role": "user", "content": [
+            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": b64}},
+            {"type": "text", "text": f"New instruction from user: {updated_task}"},
+        ]}],
+        system=PLAN_PROMPT,
+    )
+    log(f"New plan:\n{plan_resp}")
+    plan_text = re.sub(r'PLAN:\s*', '', plan_resp).strip()  # drop the "PLAN:" header before speaking
+    if plan_text and not stop_agent:
+        speak(f"Here's my new plan. {plan_text}")
+
+    return updated_task
+
+
+def _agent_thread(audio: np.ndarray):
+    """Transcribe *audio*, speak a plan, then loop screenshot -> Claude -> action until done, stopped or out of steps."""
+    global stop_agent, new_instruction
+    stop_agent = False
+    new_instruction = None
+
+    try:
+        user_text = transcribe(audio)
+        if not user_text:
+            log("No speech detected.")
+            return
+        log(f"You: {user_text}")
+
+        if stop_agent: return
+
+        # Plan. Only Stop aborts here; a Listen press is picked up at the top of the step loop.
+        log("Planning...")
+        b64, orig, scaled = capture_screen()
+        if stop_agent: return
+
+        plan_resp = call_claude(
+            [{"role":"user","content":[
+                {"type":"image","source":{"type":"base64","media_type":"image/png","data":b64}},
+                {"type":"text","text":f"User request: {user_text}"},
+            ]}],
+            system=PLAN_PROMPT,
+        )
+        if stop_agent: return
+
+        log(f"Plan:\n{plan_resp}")
+        plan_text = re.sub(r'PLAN:\s*', '', plan_resp).strip()
+        if plan_text and not interrupted():  # do not talk over a user who is already speaking
+            speak(f"Here's my plan. {plan_text}")
+
+        if stop_agent: return
+
+        messages = []
+        task = user_text
+        step = 0
+
+        while step < MAX_AGENT_STEPS:
+
+            # Stop ends the run; every mid-step interrupt check below returns here via `continue`.
+            if stop_agent:
+                break
+
+            if STATE == "listening":
+                task = wait_for_listen_and_reconfig(messages, task)
+                if stop_agent: break
+                # Reset to fresh context with new task
+                messages = []
+                _set_state("running")
+                continue  # restart loop with new task
+
+            step += 1
+            log(f"--- Step {step} ---")
+
+            time.sleep(0.3)
+            if interrupted(): continue
+
+            b64, orig, scaled = capture_screen()
+            if interrupted(): continue
+
+            content = [
+                {"type":"image","source":{"type":"base64","media_type":"image/png","data":b64}},
+                {"type":"text","text":f"Task: {task}" if not messages else "Current screen. Next action?"},
+            ]
+            messages.append({"role":"user","content":content})
+
+            log("Thinking...")
+            resp = call_claude(messages, system=STEP_PROMPT)
+            messages.append({"role":"assistant","content":resp})
+
+            if interrupted(): continue
+
+            explain, cmd_name, args, is_done = parse_response(resp)
+            log(f"  {explain}")
+
+            if explain:
+                speak(explain)      # stop_tts() will cut this short if interrupted
+
+            if interrupted(): continue
+
+            if is_done or not cmd_name:
+                speak("Done! The task is complete.")
+                log("Done!")
+                break
+
+            run_action(cmd_name, args, orig, scaled)
+
+            # No check needed here: the loop top re-tests Stop and Listen before the next step.
+
+        if step >= MAX_AGENT_STEPS and not stop_agent:
+            speak("I've done my best. Let me know if you need more.")
+
+    except Exception as e:
+        log(f"Error: {e}")
+    finally:
+        _set_state("idle")
+        btn_stop.config(state="disabled")
+
+
+# ── UI ────────────────────────────────────────────────────────────
+def log(msg: str):  # append msg to the log widget and echo it to stdout
+    text_log.config(state="normal")  # the widget is kept read-only; unlock it just for the insert
+    text_log.insert("end", msg + "\n")
+    text_log.see("end")  # scroll so the newest line is visible
+    text_log.config(state="disabled")
+    print(msg)
+
+
+def main():
+    """Build the Tk control panel, start loading Whisper in the background, and run the UI loop."""
+    if not ANTHROPIC_API_KEY:
+        print("ERROR: ANTHROPIC_API_KEY not found.")
+        sys.exit(1)
+    global btn_record, btn_stop, text_log, root_window
+
+    root = tk.Tk()
+    root_window = root
+    root.title("Clicky - AI Assistant")
+    root.geometry("440x340")
+    root.resizable(False, False)
+    root.attributes("-topmost", True)  # keep the panel above other windows
+
+    tk.Label(root, text="Clicky", font=("Segoe UI", 18, "bold"), fg="#3B82F6").pack(pady=(14,2))
+    tk.Label(root, text="AI Screen Assistant", font=("Segoe UI", 10), fg="#6B7280").pack()
+
+    bf = tk.Frame(root); bf.pack(pady=12)  # row holding the two buttons
+
+    btn_record = tk.Button(
+        bf, text="Listen",
+        # Blue=idle, Yellow=recording, Green=running, Purple=listening-for-instruction
+        font=("Segoe UI", 12, "bold"),
+        bg="#3B82F6", fg="white",
+        activebackground="#2563EB", activeforeground="white",
+        relief="flat", padx=22, pady=10,
+        cursor="hand2", command=on_record_btn,
+    )
+    btn_record.pack(side="left", padx=6)
+
+    btn_stop = tk.Button(
+        bf, text="Stop",
+        font=("Segoe UI", 12, "bold"),
+        bg="#EF4444", fg="white",
+        activebackground="#DC2626", activeforeground="white",
+        relief="flat", padx=22, pady=10,
+        cursor="hand2", state="disabled",  # enabled by _set_state() while running/listening
+        command=on_stop,
+    )
+    btn_stop.pack(side="left", padx=6)
+
+    tk.Label(root, text="Log:", font=("Segoe UI", 9), fg="#6B7280").pack(anchor="w", padx=16)
+    text_log = tk.Text(
+        root, height=11, font=("Consolas", 9),
+        bg="#F9FAFB", fg="#111827",
+        state="disabled", relief="flat", bd=0,  # read-only; log() unlocks it for each insert
+    )
+    text_log.pack(fill="both", padx=16, pady=(0,14))
+
+    log("Clicky ready.")
+    log("  Blue=idle  Yellow=recording  Green=running  Purple=listening")
+    log("  Click Listen anytime — it always listens to you.")
+    threading.Thread(target=load_whisper, daemon=True).start()  # load the model without blocking the UI
+    root.mainloop()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/windows/requirements.txt b/windows/requirements.txt
new file mode 100644
index 00000000..415030ca
--- /dev/null
+++ b/windows/requirements.txt
@@ -0,0 +1,10 @@
+anthropic
+faster-whisper
+edge-tts  # speech is played back with ffplay, so FFmpeg must be on PATH
+mss
+sounddevice
+numpy
+scipy
+Pillow
+pyautogui
+python-dotenv