From 25f21c18e4fcb938ee953b7890b4f6dbdf7fbc2d Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Tue, 14 Apr 2026 20:04:15 +0530 Subject: [PATCH 01/81] feat: add OCR and video analysis actions (#155) --- agent_core/core/impl/action/router.py | 2 + agent_core/core/impl/vlm/interface.py | 173 +++++ agent_core/core/llm/google_gemini_client.py | 69 ++ agent_core/decorators/log_events.py | 1 + agent_core/decorators/profiler.py | 1 + app/data/action/perform_ocr.py | 82 +++ app/data/action/understand_video.py | 92 +++ app/internal_action_interface.py | 65 ++ requirements.txt | 1 + tests/test_step1_vlm_interface.py | 563 ++++++++++++++++ tests/test_step2_iai_methods.py | 76 +++ tests/test_step2_internal_action_interface.py | 599 ++++++++++++++++++ tests/test_step3_perform_ocr_action.py | 129 ++++ tests/test_step4_understand_video_action.py | 116 ++++ 14 files changed, 1969 insertions(+) create mode 100644 app/data/action/perform_ocr.py create mode 100644 app/data/action/understand_video.py create mode 100644 tests/test_step1_vlm_interface.py create mode 100644 tests/test_step2_iai_methods.py create mode 100644 tests/test_step2_internal_action_interface.py create mode 100644 tests/test_step3_perform_ocr_action.py create mode 100644 tests/test_step4_understand_video_action.py diff --git a/agent_core/core/impl/action/router.py b/agent_core/core/impl/action/router.py index 12f1fef9..210c2458 100644 --- a/agent_core/core/impl/action/router.py +++ b/agent_core/core/impl/action/router.py @@ -6,6 +6,8 @@ based on user queries using LLM reasoning. 
""" +from __future__ import annotations + import json import ast from typing import Optional, List, Dict, Any, Tuple diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index dce58675..455de4af 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -286,6 +286,112 @@ async def generate_response_async( log_response, ) + def describe_image_ocr( + self, + image_path: str, + user_prompt: str | None = None, + ) -> str: + """ + Run OCR on an image. Returns raw extracted text, not a description. + Uses a structured extraction system prompt regardless of provider. + """ + if not os.path.isfile(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + + with open(image_path, "rb") as f: + image_bytes = f.read() + + system_prompt = ( + "You are a precise OCR engine. Extract ALL text from this image exactly as it appears. " + "Preserve line breaks, indentation, and formatting. " + "Do NOT add commentary, interpretation, or markdown. " + "Output only the raw extracted text. If no text is present, output an empty string." + ) + effective_user = user_prompt or "Extract all text from this image." 
+ + logger.info(f"[LLM SEND] OCR request | path={image_path}") + + if self.provider in ("openai", "minimax", "deepseek", "moonshot", "grok"): + response = self._openai_describe_bytes_plain(image_bytes, system_prompt, effective_user) + elif self.provider == "remote": + response = self._ollama_describe_bytes(image_bytes, system_prompt, effective_user) + elif self.provider == "gemini": + response = self._gemini_describe_bytes(image_bytes, system_prompt, effective_user) + elif self.provider == "byteplus": + response = self._byteplus_describe_bytes(image_bytes, system_prompt, effective_user) + elif self.provider == "anthropic": + response = self._anthropic_describe_bytes(image_bytes, system_prompt, effective_user) + else: + raise RuntimeError(f"Unknown provider {self.provider!r}") + + cleaned = re.sub(self._CODE_BLOCK_RE, "", response.get("content", "").strip()) + + tokens_used = response.get("tokens_used", 0) + if tokens_used: + self._set_token_count(self._get_token_count() + tokens_used) + + logger.info(f"[LLM RECV OCR] {cleaned[:120]}...") + return cleaned + + def describe_video_frames( + self, + video_path: str, + query: str | None = None, + max_frames: int = 8, + ) -> str: + """ + Analyse video by extracting evenly-spaced keyframes and sending to VLM. + Falls back to graceful error if OpenCV is unavailable. + """ + try: + import cv2 + except ImportError: + raise RuntimeError( + "opencv-python-headless is required for video analysis. 
" + "Install with: pip install opencv-python-headless" + ) + + if not os.path.isfile(video_path): + raise FileNotFoundError(f"Video file not found: {video_path}") + + cap = cv2.VideoCapture(video_path) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + if total_frames == 0: + cap.release() + raise ValueError("Video has 0 frames or could not be read.") + + indices = [int(i * total_frames / max_frames) for i in range(max_frames)] + frame_bytes_list: list[bytes] = [] + + for idx in indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ret, frame = cap.read() + if ret: + success, buf = cv2.imencode(".jpg", frame) + if success: + frame_bytes_list.append(buf.tobytes()) + cap.release() + + if not frame_bytes_list: + raise ValueError("Could not extract any frames from the video.") + + system_prompt = ( + f"You are analysing a video represented by {len(frame_bytes_list)} evenly-spaced keyframes. " + "Provide: 1) An overall narrative summary of what is happening, " + "2) Any visible text or titles, " + "3) Key objects, people, or scenes, " + "4) Notable transitions between frames." + ) + effective_user = query or "Summarise the content of this video." 
+ + # For multi-frame, send frames sequentially (all providers support single-image per call) + # Gemini 1.5 Pro supports native multi-image; others receive concatenated descriptions + if self.provider == "gemini" and len(frame_bytes_list) > 1: + return self._gemini_describe_video_frames(frame_bytes_list, system_prompt, effective_user) + else: + # Universal fallback: describe each frame, then synthesise + return self._multi_frame_describe_fallback(frame_bytes_list, system_prompt, effective_user) + # ───────────────────── Provider Helpers ───────────────────── def _report_usage_async( @@ -317,6 +423,73 @@ def _report_usage_async( except Exception as e: logger.warning(f"[VLM] Failed to report usage: {e}") + def _openai_describe_bytes_plain(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: + """OpenAI vision request WITHOUT json_object enforcement — for raw text output (OCR).""" + img_b64 = base64.b64encode(image_bytes).decode() + messages: list[Dict[str, Any]] = [] + if sys: + messages.append({"role": "system", "content": sys}) + messages.append({ + "role": "user", + "content": [ + {"type": "text", "text": usr}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, + ], + }) + response = self.client.chat.completions.create( + model=self.model, + messages=messages, + temperature=self.temperature, + max_tokens=4096, # OCR may return large amounts of text + # NOTE: No response_format — OCR returns plain text + ) + content = response.choices[0].message.content.strip() + total_tokens = response.usage.prompt_tokens + response.usage.completion_tokens + return {"tokens_used": total_tokens, "content": content} + + def _gemini_describe_video_frames( + self, frame_bytes_list: list[bytes], sys: str | None, usr: str + ) -> str: + """Gemini-specific multi-image frame analysis in a single API call.""" + result = self._gemini_client.generate_multimodal_multi_image( + self.model, + text=usr, + image_bytes_list=frame_bytes_list, + 
system_prompt=sys, + temperature=self.temperature, + json_mode=False, + ) + tokens_used = result.get("tokens_used", 0) + if tokens_used: + self._set_token_count(self._get_token_count() + tokens_used) + return re.sub(self._CODE_BLOCK_RE, "", result.get("content", "").strip()) + + def _multi_frame_describe_fallback( + self, frame_bytes_list: list[bytes], system_prompt: str, user_prompt: str + ) -> str: + """Describe each frame individually, then synthesise into a narrative.""" + frame_descriptions = [] + for i, fb in enumerate(frame_bytes_list): + desc = self.describe_image_bytes( + fb, + system_prompt=f"Frame {i+1} of {len(frame_bytes_list)}: Describe what you see.", + user_prompt=user_prompt, + log_response=False, + ) + frame_descriptions.append(f"[Frame {i+1}]: {desc}") + + synthesis_prompt = ( + "You received descriptions of video keyframes. Write a coherent video summary:\n\n" + + "\n".join(frame_descriptions) + ) + synthesis = self.describe_image_bytes( + frame_bytes_list[-1], # anchor with last frame for context + system_prompt=system_prompt, + user_prompt=synthesis_prompt, + log_response=True, + ) + return synthesis + def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: """OpenAI vision request with automatic prompt caching metrics.""" img_b64 = base64.b64encode(image_bytes).decode() diff --git a/agent_core/core/llm/google_gemini_client.py b/agent_core/core/llm/google_gemini_client.py index f6d1688b..3cbffe44 100644 --- a/agent_core/core/llm/google_gemini_client.py +++ b/agent_core/core/llm/google_gemini_client.py @@ -236,6 +236,75 @@ def generate_multimodal( "cached_tokens": cached_tokens, } + def generate_multimodal_multi_image( + self, + model: str, + *, + text: str, + image_bytes_list: List[bytes], + system_prompt: Optional[str] = None, + temperature: Optional[float] = None, + json_mode: bool = False, + ) -> Dict[str, Any]: + """Generate text from a prompt that contains multiple inline images. 
+ + Args: + model: Model identifier + text: The text prompt + image_bytes_list: List of PNG/JPEG image data + system_prompt: Optional system instruction + temperature: Sampling temperature + json_mode: If True, enforce JSON output format + + Returns: + Dict with generation results and token counts + """ + parts: List[Dict[str, Any]] = [{"text": text}] + + for image_bytes in image_bytes_list: + inline_data = { + "mimeType": "image/jpeg", + "data": base64.b64encode(image_bytes).decode("utf-8"), + } + parts.append({"inlineData": inline_data}) + + contents = [{"role": "user", "parts": parts}] + + payload: Dict[str, Any] = {"contents": contents} + if system_prompt: + payload["systemInstruction"] = { + "parts": [{"text": system_prompt}], + } + + generation_config: Dict[str, Any] = {} + if temperature is not None: + generation_config["temperature"] = temperature + if json_mode: + generation_config["responseMimeType"] = "application/json" + if generation_config: + payload["generationConfig"] = generation_config + + response = self._post_json( + f"{_normalise_model_name(model)}:generateContent", payload + ) + + # Extract token usage from usageMetadata + usage_metadata = response.get("usageMetadata", {}) + total_tokens = usage_metadata.get("totalTokenCount", 0) + prompt_tokens = usage_metadata.get("promptTokenCount", 0) + completion_tokens = usage_metadata.get("candidatesTokenCount", 0) + cached_tokens = usage_metadata.get("cachedContentTokenCount", 0) + + content = self._extract_text(response) + + return { + "tokens_used": total_tokens, + "content": content, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "cached_tokens": cached_tokens, + } + def embed_text(self, model: str, *, text: str) -> List[float]: """Fetch an embedding vector for the supplied text. 
diff --git a/agent_core/decorators/log_events.py b/agent_core/decorators/log_events.py index ab9a7cfe..3e6d1571 100644 --- a/agent_core/decorators/log_events.py +++ b/agent_core/decorators/log_events.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import annotations """ Flexible function-level logging: - logs start diff --git a/agent_core/decorators/profiler.py b/agent_core/decorators/profiler.py index 38e5e77c..78fc4f5b 100644 --- a/agent_core/decorators/profiler.py +++ b/agent_core/decorators/profiler.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import annotations """ Profiler Module - Comprehensive performance tracking for the agent. diff --git a/app/data/action/perform_ocr.py b/app/data/action/perform_ocr.py new file mode 100644 index 00000000..3c1d01d9 --- /dev/null +++ b/app/data/action/perform_ocr.py @@ -0,0 +1,82 @@ +from agent_core import action + +@action( + name="perform_ocr", + description="Extracts all text from an image using OCR via a Vision Language Model. Use this when the user wants to read text from a screenshot, scanned document, photo of a receipt, whiteboard, sign, or any image containing text. Returns extracted text saved to a file in workspace.", + mode="CLI", + action_sets=["document_processing, image"], + input_schema={ + "image_path": { + "type": "string", + "example": "C:\\Users\\user\\Pictures\\receipt.jpg", + "description": "Absolute path to the image file containing text to extract." + }, + "user_prompt": { + "type": "string", + "example": "Extract all text including prices and product names.", + "description": "Optional: extra instruction to guide the OCR (e.g. focus on specific regions or text types)." + } + }, + output_schema={ + "status": { + "type": "string", + "example": "success", + "description": "'success' if OCR completed, 'error' otherwise." 
+ }, + "summary": { + "type": "string", + "example": "OCR complete: 42 lines, 1250 characters extracted.", + "description": "Brief summary of extraction results." + }, + "file_path": { + "type": "string", + "example": "/workspace/ocr_result_20260414_153000.txt", + "description": "Absolute path to the .txt file containing full extracted text." + }, + "file_saved": { + "type": "boolean", + "example": True, + "description": "True if the extracted text was saved to disk." + }, + "message": { + "type": "string", + "example": "File not found.", + "description": "Error message if applicable." + } + }, + test_payload={ + "image_path": "C:\\Users\\user\\Pictures\\sample.jpg", + "user_prompt": "Extract all visible text.", + "simulated_mode": True + } +) +def perform_ocr(input_data: dict) -> dict: + import os + + image_path = str(input_data.get('image_path', '')).strip() + user_prompt = str(input_data.get('user_prompt', '')).strip() or None + simulated_mode = input_data.get('simulated_mode', False) + + if simulated_mode: + return { + 'status': 'success', + 'summary': 'OCR complete: 5 lines, 120 characters extracted.', + 'file_path': '/workspace/ocr_result_simulated.txt', + 'file_saved': True, + 'message': '' + } + + if not image_path: + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'image_path is required.'} + + if not os.path.isfile(image_path): + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'File not found.'} + + try: + import app.internal_action_interface as iai + result = iai.InternalActionInterface.perform_ocr(image_path, user_prompt=user_prompt) + return {**result, 'message': ''} + except Exception as e: + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} + +execute = perform_ocr diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py new file mode 100644 index 00000000..d40b4dfb --- /dev/null +++ 
b/app/data/action/understand_video.py @@ -0,0 +1,92 @@ +from agent_core import action + +@action( + name="understand_video", + description="Analyses a video file by sampling keyframes and generating a narrative summary using a Vision Language Model. Use when the user shares a video and wants to know what happens in it, extract visible text, or answer a specific question about video content.", + mode="CLI", + action_sets=["document_processing, image"], + input_schema={ + "video_path": { + "type": "string", + "example": "C:\\Users\\user\\Videos\\meeting.mp4", + "description": "Absolute path to the video file (MP4, AVI, MOV supported)." + }, + "query": { + "type": "string", + "example": "What is being presented on the slides?", + "description": "Optional: specific question to answer about the video." + }, + "max_frames": { + "type": "integer", + "example": 8, + "description": "Number of evenly-spaced keyframes to sample (default: 8, max recommended: 16)." + } + }, + output_schema={ + "status": { + "type": "string", + "example": "success", + "description": "'success' if analysis completed, 'error' otherwise." + }, + "summary": { + "type": "string", + "example": "The video shows a person presenting slides about quarterly sales...", + "description": "First 500 characters of the video summary. Full summary saved to file." + }, + "file_path": { + "type": "string", + "example": "/workspace/video_summary_20260414_153000.txt", + "description": "Absolute path to the .txt file containing the full video summary." + }, + "file_saved": { + "type": "boolean", + "example": True, + "description": "True if the full summary was saved to disk." + }, + "message": { + "type": "string", + "example": "File not found.", + "description": "Error message if applicable." 
+ } + }, + test_payload={ + "video_path": "C:\\Users\\user\\Videos\\sample.mp4", + "query": "Summarise the video content.", + "max_frames": 8, + "simulated_mode": True + } +) +def understand_video(input_data: dict) -> dict: + import os + + video_path = str(input_data.get('video_path', '')).strip() + query = str(input_data.get('query', '')).strip() or None + max_frames = int(input_data.get('max_frames', 8)) + simulated_mode = input_data.get('simulated_mode', False) + + if simulated_mode: + return { + 'status': 'success', + 'summary': 'The video shows a simulated presentation with 3 speakers.', + 'file_path': '/workspace/video_summary_simulated.txt', + 'file_saved': True, + 'message': '' + } + + if not video_path: + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'video_path is required.'} + + if not os.path.isfile(video_path): + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'File not found.'} + + try: + import app.internal_action_interface as iai + result = iai.InternalActionInterface.understand_video(video_path, query=query, max_frames=max_frames) + return {**result, 'message': ''} + except RuntimeError as e: + # Catches missing opencv gracefully + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} + except Exception as e: + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} + +execute = understand_video diff --git a/app/internal_action_interface.py b/app/internal_action_interface.py index a1486f1b..45cb7c8a 100644 --- a/app/internal_action_interface.py +++ b/app/internal_action_interface.py @@ -5,6 +5,8 @@ framework internal functions. 
""" +from __future__ import annotations + from typing import Dict, Any, Optional, List, TYPE_CHECKING from app.llm import LLMInterface, LLMCallType from app.vlm_interface import VLMInterface @@ -98,6 +100,69 @@ def describe_image(cls, image_path: str, prompt: Optional[str] = None) -> str: raise RuntimeError("InternalActionInterface not initialized with VLMInterface.") return cls.vlm_interface.describe_image(image_path, user_prompt=prompt) + @classmethod + def perform_ocr(cls, image_path: str, user_prompt: Optional[str] = None) -> dict: + """ + Run OCR on an image and persist the extracted text to workspace. + Returns a concise status dict + saved file path to avoid TUI flooding. + """ + if cls.vlm_interface is None: + raise RuntimeError("InternalActionInterface not initialized with VLMInterface.") + + import os + from datetime import datetime + + raw_text = cls.vlm_interface.describe_image_ocr(image_path, user_prompt=user_prompt) + + # Persist to workspace to prevent token ballooning in the agent context + ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + out_path = os.path.join(AGENT_WORKSPACE_ROOT, f"ocr_result_{ts}.txt") + with open(out_path, "w", encoding="utf-8") as f: + f.write(raw_text) + + line_count = raw_text.count("\n") + 1 + char_count = len(raw_text) + return { + "status": "success", + "summary": f"OCR complete: {line_count} lines, {char_count} characters extracted.", + "text": raw_text, + "file_path": out_path, + "file_saved": True, + } + + @classmethod + def understand_video( + cls, + video_path: str, + query: Optional[str] = None, + max_frames: int = 8, + ) -> dict: + """ + Analyse a video by extracting keyframes and querying the VLM. + Persists the summary to workspace to avoid TUI/context flooding. 
+ """ + if cls.vlm_interface is None: + raise RuntimeError("InternalActionInterface not initialized with VLMInterface.") + + import os + from datetime import datetime + + summary = cls.vlm_interface.describe_video_frames( + video_path, query=query, max_frames=max_frames + ) + + ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + out_path = os.path.join(AGENT_WORKSPACE_ROOT, f"video_summary_{ts}.txt") + with open(out_path, "w", encoding="utf-8") as f: + f.write(summary) + + return { + "status": "success", + "summary": summary[:500] + ("..." if len(summary) > 500 else ""), + "file_path": out_path, + "file_saved": True, + } + # ───────────────── Memory Search ───────────────── @classmethod diff --git a/requirements.txt b/requirements.txt index bd6fdd9f..53eda7dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,3 +45,4 @@ watchdog telethon croniter>=2.0.0 # Cron expression parsing for scheduler playwright # WhatsApp Web browser automation +opencv-python-headless # Video analysis keyframe extraction diff --git a/tests/test_step1_vlm_interface.py b/tests/test_step1_vlm_interface.py new file mode 100644 index 00000000..c1bf516f --- /dev/null +++ b/tests/test_step1_vlm_interface.py @@ -0,0 +1,563 @@ +# -*- coding: utf-8 -*- +""" +Step 1 Verification Suite — VLM Interface Extensions +Tests for: describe_image_ocr, describe_video_frames, _openai_describe_bytes_plain, + _gemini_describe_video_frames, _multi_frame_describe_fallback, + GeminiClient.generate_multimodal_multi_image + +Run with: + python -m pytest tests/test_step1_vlm_interface.py -v + +ALL tests must pass. Zero real API calls are made. +Zero imports of app.* are required — only agent_core. 
+""" + +from __future__ import annotations + +import base64 +import io +import os +import sys +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch, call + +# ───────────────────────────────────────────────────────────────── +# SECTION A: GeminiClient.generate_multimodal_multi_image +# ───────────────────────────────────────────────────────────────── + +class TestGeminiClientMultiImage(unittest.TestCase): + """ + VERIFY: GeminiClient.generate_multimodal_multi_image exists and + constructs the correct payload (one inlineData part per frame). + """ + + def _make_client(self): + from agent_core.core.llm.google_gemini_client import GeminiClient + client = GeminiClient.__new__(GeminiClient) + client._api_key = "fake-key" + client._api_base = "https://generativelanguage.googleapis.com" + client._api_version = "v1beta" + client._timeout = 30 + return client + + def test_method_exists(self): + """generate_multimodal_multi_image must exist on GeminiClient.""" + from agent_core.core.llm.google_gemini_client import GeminiClient + self.assertTrue( + hasattr(GeminiClient, "generate_multimodal_multi_image"), + "FAIL: GeminiClient.generate_multimodal_multi_image not found. 
" + "Add it to agent_core/core/llm/google_gemini_client.py" + ) + + def test_payload_contains_multiple_inline_data_parts(self): + """The API payload must contain one inlineData entry per frame passed in.""" + client = self._make_client() + fake_response = { + "candidates": [{"content": {"parts": [{"text": "video summary"}]}, "finishReason": "STOP"}], + "usageMetadata": {"totalTokenCount": 100, "promptTokenCount": 80, "candidatesTokenCount": 20}, + } + + captured_payload = {} + + def fake_post(path, payload): + captured_payload.update(payload) + return fake_response + + client._post_json = fake_post + + frame_bytes = [b"frame1_bytes", b"frame2_bytes", b"frame3_bytes"] + result = client.generate_multimodal_multi_image( + "gemini-2.5-flash", + text="What is happening?", + image_bytes_list=frame_bytes, + system_prompt="Analyse these frames.", + temperature=0.5, + json_mode=False, + ) + + # Assert return shape + self.assertIn("content", result) + self.assertIn("tokens_used", result) + self.assertEqual(result["content"], "video summary") + + # Assert payload structure: must have text part + 3 inlineData parts + parts = captured_payload["contents"][0]["parts"] + inline_parts = [p for p in parts if "inlineData" in p] + text_parts = [p for p in parts if "text" in p] + + self.assertEqual(len(inline_parts), 3, + f"Expected 3 inlineData parts, got {len(inline_parts)}") + self.assertEqual(len(text_parts), 1, + f"Expected 1 text part, got {len(text_parts)}") + + # Assert each frame is correctly base64-encoded in the payload + for i, (part, raw) in enumerate(zip(inline_parts, frame_bytes)): + expected_b64 = base64.b64encode(raw).decode() + actual_b64 = part["inlineData"]["data"] + self.assertEqual(actual_b64, expected_b64, + f"Frame {i+1}: base64 mismatch in payload") + + def test_system_prompt_is_included(self): + """systemInstruction must be present in payload when system_prompt is given.""" + client = self._make_client() + fake_response = { + "candidates": [{"content": 
{"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], + "usageMetadata": {"totalTokenCount": 10, "promptTokenCount": 8, "candidatesTokenCount": 2}, + } + captured = {} + client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] + + client.generate_multimodal_multi_image( + "gemini-2.5-flash", + text="Describe", + image_bytes_list=[b"img"], + system_prompt="You are an expert.", + ) + self.assertIn("systemInstruction", captured, + "FAIL: systemInstruction missing from payload when system_prompt is provided") + + def test_no_system_prompt_omits_key(self): + """systemInstruction must be absent when system_prompt is None.""" + client = self._make_client() + fake_response = { + "candidates": [{"content": {"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], + "usageMetadata": {"totalTokenCount": 5}, + } + captured = {} + client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] + + client.generate_multimodal_multi_image( + "gemini-2.5-flash", + text="Describe", + image_bytes_list=[b"img"], + system_prompt=None, + ) + self.assertNotIn("systemInstruction", captured, + "FAIL: systemInstruction should be absent when no system_prompt is given") + + +# ───────────────────────────────────────────────────────────────── +# SECTION B: VLMInterface._openai_describe_bytes_plain +# ───────────────────────────────────────────────────────────────── + +class TestOpenAIDescribeBytesPlain(unittest.TestCase): + """ + VERIFY: _openai_describe_bytes_plain exists and does NOT set + response_format=json_object (that would break raw OCR text output). 
+ """ + + def _make_vlm(self): + """Instantiate VLMInterface in deferred mode so no real API calls are made.""" + with patch("app.models.factory.ModelFactory.create") as mock_create: + mock_create.return_value = { + "model": "gpt-4o", + "client": MagicMock(), + "gemini_client": None, + "remote_url": None, + "anthropic_client": None, + "initialized": True, + "byteplus": None, + "provider": "openai", + } + from agent_core.core.impl.vlm.interface import VLMInterface + vlm = VLMInterface(provider="openai", deferred=True) + vlm.provider = "openai" + return vlm + + def test_method_exists(self): + """_openai_describe_bytes_plain must exist on VLMInterface.""" + from agent_core.core.impl.vlm.interface import VLMInterface + self.assertTrue( + hasattr(VLMInterface, "_openai_describe_bytes_plain"), + "FAIL: _openai_describe_bytes_plain not found on VLMInterface. " + "Add it to agent_core/core/impl/vlm/interface.py" + ) + + def test_no_response_format_json_object(self): + """ + CRITICAL: _openai_describe_bytes_plain must NOT pass + response_format={'type': 'json_object'} to the OpenAI client. + OCR returns raw text — json_object enforces a JSON wrapper and breaks it. + """ + vlm = self._make_vlm() + + mock_choice = MagicMock() + mock_choice.message.content = "Hello World\nLine 2" + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 50 + mock_response.usage.completion_tokens = 20 + + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + vlm._openai_describe_bytes_plain(b"fake_image_bytes", "sys prompt", "Extract text") + + call_kwargs = vlm.client.chat.completions.create.call_args[1] + self.assertNotIn("response_format", call_kwargs, + "FAIL: response_format is present in _openai_describe_bytes_plain. 
" + "Remove it — OCR must return raw text, not JSON.") + + def test_returns_dict_with_content_and_tokens(self): + """Must return dict with 'content' and 'tokens_used' keys.""" + vlm = self._make_vlm() + + mock_choice = MagicMock() + mock_choice.message.content = "Extracted: Invoice #1234" + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 40 + mock_response.usage.completion_tokens = 15 + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + result = vlm._openai_describe_bytes_plain(b"img", None, "Extract text") + + self.assertIsInstance(result, dict) + self.assertIn("content", result) + self.assertIn("tokens_used", result) + self.assertEqual(result["content"], "Extracted: Invoice #1234") + self.assertEqual(result["tokens_used"], 55) + + def test_max_tokens_is_at_least_4096(self): + """ + OCR may produce large amounts of text. max_tokens must be >= 4096. + """ + vlm = self._make_vlm() + mock_choice = MagicMock() + mock_choice.message.content = "text" + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + vlm._openai_describe_bytes_plain(b"img", None, "Extract text") + + call_kwargs = vlm.client.chat.completions.create.call_args[1] + max_tokens = call_kwargs.get("max_tokens", call_kwargs.get("max_completion_tokens", 0)) + self.assertGreaterEqual(max_tokens, 4096, + f"FAIL: max_tokens={max_tokens}. 
OCR needs at least 4096 to handle large text blocks.") + + +# ───────────────────────────────────────────────────────────────── +# SECTION C: VLMInterface.describe_image_ocr +# ───────────────────────────────────────────────────────────────── + +class TestDescribeImageOcr(unittest.TestCase): + """ + VERIFY: describe_image_ocr exists, routes to the correct provider branch, + uses an OCR-specific system prompt, and handles FileNotFoundError. + """ + + def _make_vlm_patched(self, provider="openai"): + with patch("app.models.factory.ModelFactory.create") as mock_create: + mock_create.return_value = { + "model": "gpt-4o", + "client": MagicMock(), + "gemini_client": None, + "remote_url": None, + "anthropic_client": None, + "initialized": True, + "byteplus": None, + "provider": provider, + } + from agent_core.core.impl.vlm.interface import VLMInterface + vlm = VLMInterface(provider=provider, deferred=True) + vlm.provider = provider + return vlm + + def test_method_exists(self): + from agent_core.core.impl.vlm.interface import VLMInterface + self.assertTrue( + hasattr(VLMInterface, "describe_image_ocr"), + "FAIL: describe_image_ocr not found on VLMInterface. " + "Add it to agent_core/core/impl/vlm/interface.py" + ) + + def test_raises_file_not_found_for_missing_path(self): + """Must raise FileNotFoundError when the image path does not exist.""" + vlm = self._make_vlm_patched() + with self.assertRaises(FileNotFoundError): + vlm.describe_image_ocr("/nonexistent/path/image.png") + + def test_routes_to_plain_method_for_openai(self): + """ + For provider='openai', describe_image_ocr must call + _openai_describe_bytes_plain (not _openai_describe_bytes). + This ensures json_object response format is not applied. 
+ """ + vlm = self._make_vlm_patched(provider="openai") + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png_data") + tmp_path = f.name + + try: + vlm._openai_describe_bytes_plain = MagicMock( + return_value={"content": "INVOICE\nTotal: $100", "tokens_used": 30} + ) + vlm._openai_describe_bytes = MagicMock() + + result = vlm.describe_image_ocr(tmp_path) + + vlm._openai_describe_bytes_plain.assert_called_once() + vlm._openai_describe_bytes.assert_not_called() + self.assertEqual(result, "INVOICE\nTotal: $100") + finally: + os.unlink(tmp_path) + + def test_system_prompt_contains_ocr_keywords(self): + """ + The system prompt passed to the provider must contain OCR-specific + language ('OCR', 'extract', 'text') — not a generic description prompt. + """ + vlm = self._make_vlm_patched(provider="openai") + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png_data") + tmp_path = f.name + + try: + captured_sys_prompt = {} + + def capture_plain(image_bytes, sys_prompt, user_prompt): + captured_sys_prompt["sys"] = sys_prompt or "" + return {"content": "Hello", "tokens_used": 10} + + vlm._openai_describe_bytes_plain = capture_plain + vlm.describe_image_ocr(tmp_path) + + sys_lower = captured_sys_prompt.get("sys", "").lower() + self.assertTrue( + "ocr" in sys_lower or "extract" in sys_lower or "text" in sys_lower, + f"FAIL: OCR system prompt does not mention OCR/extraction. 
Got: '{captured_sys_prompt.get('sys')}'" + ) + finally: + os.unlink(tmp_path) + + def test_returns_string(self): + """describe_image_ocr must return a string, not a dict.""" + vlm = self._make_vlm_patched(provider="openai") + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png_data") + tmp_path = f.name + + try: + vlm._openai_describe_bytes_plain = MagicMock( + return_value={"content": "TEXT FROM IMAGE", "tokens_used": 20} + ) + result = vlm.describe_image_ocr(tmp_path) + self.assertIsInstance(result, str) + finally: + os.unlink(tmp_path) + + +# ───────────────────────────────────────────────────────────────── +# SECTION D: VLMInterface.describe_video_frames +# ───────────────────────────────────────────────────────────────── + +class TestDescribeVideoFrames(unittest.TestCase): + """ + VERIFY: describe_video_frames exists, handles missing file, + handles missing opencv gracefully, and calls the correct + provider path (Gemini native vs. fallback). + """ + + def _make_vlm_patched(self, provider="openai"): + with patch("app.models.factory.ModelFactory.create") as mock_create: + mock_create.return_value = { + "model": "gpt-4o", + "client": MagicMock(), + "gemini_client": None, + "remote_url": None, + "anthropic_client": None, + "initialized": True, + "byteplus": None, + "provider": provider, + } + from agent_core.core.impl.vlm.interface import VLMInterface + vlm = VLMInterface(provider=provider, deferred=True) + vlm.provider = provider + return vlm + + def test_method_exists(self): + from agent_core.core.impl.vlm.interface import VLMInterface + self.assertTrue( + hasattr(VLMInterface, "describe_video_frames"), + "FAIL: describe_video_frames not found on VLMInterface." 
+ ) + + def test_raises_file_not_found_for_missing_video(self): + """Must raise FileNotFoundError when the video path does not exist.""" + vlm = self._make_vlm_patched() + with self.assertRaises(FileNotFoundError): + vlm.describe_video_frames("/nonexistent/video.mp4") + + def test_raises_runtime_error_when_opencv_missing(self): + """ + When opencv is not installed, describe_video_frames must raise + a RuntimeError with an actionable install message — not an ImportError. + This ensures a clean error surface for the user. + """ + vlm = self._make_vlm_patched() + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4_data") + tmp_path = f.name + + try: + with patch.dict(sys.modules, {"cv2": None}): + with self.assertRaises(RuntimeError) as ctx: + vlm.describe_video_frames(tmp_path) + self.assertIn("opencv", str(ctx.exception).lower(), + "FAIL: RuntimeError message must mention 'opencv' to guide the user.") + finally: + os.unlink(tmp_path) + + def test_gemini_uses_native_multi_image_method(self): + """ + For provider='gemini', describe_video_frames must call + _gemini_describe_video_frames (native multi-image path). + It must NOT fall back to the sequential per-frame fallback. 
+ """ + vlm = self._make_vlm_patched(provider="gemini") + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4_data") + tmp_path = f.name + + try: + mock_cv2 = MagicMock() + mock_cap = MagicMock() + mock_cap.get.return_value = 30.0 + mock_cap.read.return_value = (True, MagicMock()) + mock_cv2.VideoCapture.return_value = mock_cap + mock_cv2.imencode.return_value = (True, MagicMock(tobytes=lambda: b"frame")) + + vlm._gemini_describe_video_frames = MagicMock(return_value="Gemini video summary") + vlm._multi_frame_describe_fallback = MagicMock(return_value="fallback summary") + + with patch.dict(sys.modules, {"cv2": mock_cv2}): + result = vlm.describe_video_frames(tmp_path, max_frames=2) + + vlm._gemini_describe_video_frames.assert_called_once() + vlm._multi_frame_describe_fallback.assert_not_called() + self.assertEqual(result, "Gemini video summary") + finally: + os.unlink(tmp_path) + + def test_non_gemini_uses_fallback(self): + """ + For provider='openai', describe_video_frames must call + _multi_frame_describe_fallback (sequential frame path). 
+ """ + vlm = self._make_vlm_patched(provider="openai") + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4_data") + tmp_path = f.name + + try: + mock_cv2 = MagicMock() + mock_cap = MagicMock() + mock_cap.get.return_value = 30.0 + mock_cap.read.return_value = (True, MagicMock()) + mock_cv2.VideoCapture.return_value = mock_cap + mock_cv2.imencode.return_value = (True, MagicMock(tobytes=lambda: b"frame")) + + vlm._gemini_describe_video_frames = MagicMock(return_value="should not be called") + vlm._multi_frame_describe_fallback = MagicMock(return_value="OpenAI fallback summary") + + with patch.dict(sys.modules, {"cv2": mock_cv2}): + result = vlm.describe_video_frames(tmp_path, max_frames=2) + + vlm._multi_frame_describe_fallback.assert_called_once() + vlm._gemini_describe_video_frames.assert_not_called() + self.assertEqual(result, "OpenAI fallback summary") + finally: + os.unlink(tmp_path) + + +# ───────────────────────────────────────────────────────────────── +# SECTION E: Regression — existing describe_image still works +# ───────────────────────────────────────────────────────────────── + +class TestRegressionDescribeImage(unittest.TestCase): + """ + REGRESSION GUARD: Ensure existing describe_image and describe_image_bytes + are untouched and still produce the same output contract. + This confirms Step 1 did not break any existing functionality. 
+ """ + + def _make_vlm_patched(self): + with patch("app.models.factory.ModelFactory.create") as mock_create: + mock_create.return_value = { + "model": "gpt-4o", + "client": MagicMock(), + "gemini_client": None, + "remote_url": None, + "anthropic_client": None, + "initialized": True, + "byteplus": None, + "provider": "openai", + } + from agent_core.core.impl.vlm.interface import VLMInterface + vlm = VLMInterface(provider="openai", deferred=True) + vlm.provider = "openai" + return vlm + + def test_describe_image_still_raises_on_missing_file(self): + """describe_image must still raise FileNotFoundError (unchanged).""" + vlm = self._make_vlm_patched() + with self.assertRaises(FileNotFoundError): + vlm.describe_image("/does/not/exist.png") + + def test_describe_image_bytes_returns_string(self): + """describe_image_bytes must still return a plain string.""" + vlm = self._make_vlm_patched() + + mock_choice = MagicMock() + mock_choice.message.content = '{"content": "A cat"}' + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + result = vlm.describe_image_bytes(b"fake_image", user_prompt="Describe this image.") + self.assertIsInstance(result, str) + + def test_describe_image_bytes_uses_json_response_format(self): + """ + REGRESSION: The ORIGINAL describe_image_bytes must still use + response_format=json_object (this is the existing contract). + It should NOT be affected by the plain-text OCR variant. 
+ """ + vlm = self._make_vlm_patched() + + mock_choice = MagicMock() + mock_choice.message.content = '{"content": "A dog"}' + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + vlm.describe_image_bytes(b"fake_image", user_prompt="Describe this.") + + call_kwargs = vlm.client.chat.completions.create.call_args[1] + # Original describe_image_bytes should still request json_object + self.assertIn("response_format", call_kwargs, + "REGRESSION: describe_image_bytes lost response_format=json_object. " + "Only the new _openai_describe_bytes_plain should omit it.") + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_step2_iai_methods.py b/tests/test_step2_iai_methods.py new file mode 100644 index 00000000..415689eb --- /dev/null +++ b/tests/test_step2_iai_methods.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +import unittest +from unittest.mock import MagicMock, patch, mock_open +import os +from datetime import datetime +import asyncio + +# Mocking the constants before import if necessary, but app.config should be fine +import sys +from unittest.mock import PropertyMock + +class TestStep2InternalInterface(unittest.TestCase): + def setUp(self): + # We need to mock InternalActionInterface dependencies + self.iai_patcher = patch('app.internal_action_interface.InternalActionInterface', autospec=True) + # However, we want to test the ACTUAL methods on InternalActionInterface + # So we import it and patch its class attributes + + from app.internal_action_interface import InternalActionInterface + self.iai = InternalActionInterface + self.iai.vlm_interface = MagicMock() + self.iai.state_manager = MagicMock() + self.iai.ui_adapter = MagicMock() + + @patch('os.path.join', side_effect=lambda *args: "/".join(args)) + @patch('builtins.open', 
new_callable=mock_open) + @patch('app.internal_action_interface.AGENT_WORKSPACE_ROOT', "/mock/workspace") + def test_perform_ocr_saves_file_and_returns_dict(self, mock_file, mock_join): + # Setup + self.iai.vlm_interface.describe_image_ocr.return_value = "Extracted Text Content" + + # Execute + result = self.iai.perform_ocr("some_image.jpg", user_prompt="Test Prompt") + + # Verify call to VLM + self.iai.vlm_interface.describe_image_ocr.assert_called_once_with("some_image.jpg", user_prompt="Test Prompt") + + # Verify file saving + mock_file.assert_called_once() + handle = mock_file() + handle.write.assert_called_once_with("Extracted Text Content") + + # Verify return dict + self.assertEqual(result['status'], 'success') + self.assertTrue(result['file_saved']) + self.assertIn('ocr_result_', result['file_path']) + self.assertIn('OCR complete', result['summary']) + + @patch('os.path.join', side_effect=lambda *args: "/".join(args)) + @patch('builtins.open', new_callable=mock_open) + @patch('app.internal_action_interface.AGENT_WORKSPACE_ROOT', "/mock/workspace") + def test_understand_video_saves_file_and_returns_dict(self, mock_file, mock_join): + # Setup + self.iai.vlm_interface.describe_video_frames.return_value = "Video Summary Content" + + # Execute + result = self.iai.understand_video("some_video.mp4", query="What happens?") + + # Verify call to VLM + self.iai.vlm_interface.describe_video_frames.assert_called_once_with( + "some_video.mp4", query="What happens?", max_frames=8 + ) + + # Verify file saving + mock_file.assert_called_once() + handle = mock_file() + handle.write.assert_called_once_with("Video Summary Content") + + # Verify return dict + self.assertEqual(result['status'], 'success') + self.assertTrue(result['file_saved']) + self.assertIn('video_summary_', result['file_path']) + self.assertEqual(result['summary'], "Video Summary Content") + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_step2_internal_action_interface.py 
b/tests/test_step2_internal_action_interface.py new file mode 100644 index 00000000..8e8e8d0c --- /dev/null +++ b/tests/test_step2_internal_action_interface.py @@ -0,0 +1,599 @@ +# tests/test_step2_internal_action_interface.py +# -*- coding: utf-8 -*- +""" +Step 2 Verification Suite — InternalActionInterface Extensions +Tests for: perform_ocr() and understand_video() classmethods + +Run with: + python -m pytest tests/test_step2_internal_action_interface.py -v + +ALL tests must pass. Zero real API calls. Zero real file system dependency +outside of tempfile — all workspace writes use a patched AGENT_WORKSPACE_ROOT. + +PREREQUISITE: Step 1 tests must already be passing before running these. +""" + +from __future__ import annotations + +import os +import sys +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch, PropertyMock + + +# ───────────────────────────────────────────────────────────────── +# HELPERS +# ───────────────────────────────────────────────────────────────── + +def _reset_iai(): + """Reset InternalActionInterface class-level state between tests.""" + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.vlm_interface = None + InternalActionInterface.llm_interface = None + InternalActionInterface.task_manager = None + InternalActionInterface.state_manager = None + + +def _inject_mock_vlm(mock_vlm=None): + """Inject a mock VLMInterface into InternalActionInterface.""" + from app.internal_action_interface import InternalActionInterface + if mock_vlm is None: + mock_vlm = MagicMock() + InternalActionInterface.vlm_interface = mock_vlm + return mock_vlm + + +# ───────────────────────────────────────────────────────────────── +# SECTION A: Method Existence & Signatures +# ───────────────────────────────────────────────────────────────── + +class TestMethodExistence(unittest.TestCase): + """ + VERIFY: Both new classmethods exist and are classmethods (not staticmethods + or 
instance methods), matching the pattern of describe_image(). + """ + + def test_perform_ocr_exists(self): + from app.internal_action_interface import InternalActionInterface + self.assertTrue( + hasattr(InternalActionInterface, "perform_ocr"), + "FAIL: InternalActionInterface.perform_ocr not found. " + "Add it to app/internal_action_interface.py" + ) + + def test_understand_video_exists(self): + from app.internal_action_interface import InternalActionInterface + self.assertTrue( + hasattr(InternalActionInterface, "understand_video"), + "FAIL: InternalActionInterface.understand_video not found. " + "Add it to app/internal_action_interface.py" + ) + + def test_perform_ocr_is_classmethod(self): + """perform_ocr must be a classmethod, not a staticmethod or instance method.""" + from app.internal_action_interface import InternalActionInterface + method = InternalActionInterface.__dict__.get("perform_ocr") + self.assertIsInstance( + method, classmethod, + "FAIL: perform_ocr must be a @classmethod (matching describe_image pattern)." + ) + + def test_understand_video_is_classmethod(self): + """understand_video must be a classmethod.""" + from app.internal_action_interface import InternalActionInterface + method = InternalActionInterface.__dict__.get("understand_video") + self.assertIsInstance( + method, classmethod, + "FAIL: understand_video must be a @classmethod." + ) + + def test_perform_ocr_accepts_image_path(self): + """perform_ocr must accept image_path as its first positional argument.""" + import inspect + from app.internal_action_interface import InternalActionInterface + sig = inspect.signature(InternalActionInterface.perform_ocr) + params = list(sig.parameters.keys()) + self.assertIn("image_path", params, + f"FAIL: perform_ocr must accept 'image_path'. 
Got params: {params}") + + def test_understand_video_accepts_video_path_and_query(self): + """understand_video must accept video_path and query parameters.""" + import inspect + from app.internal_action_interface import InternalActionInterface + sig = inspect.signature(InternalActionInterface.understand_video) + params = list(sig.parameters.keys()) + self.assertIn("video_path", params, + f"FAIL: understand_video must accept 'video_path'. Got: {params}") + self.assertIn("query", params, + f"FAIL: understand_video must accept 'query'. Got: {params}") + + def tearDown(self): + _reset_iai() + + +# ───────────────────────────────────────────────────────────────── +# SECTION B: VLM Guard — RuntimeError when not initialized +# ───────────────────────────────────────────────────────────────── + +class TestVLMGuard(unittest.TestCase): + """ + VERIFY: Both methods raise RuntimeError when vlm_interface is None, + matching the guard pattern of describe_image() and describe_screen(). + """ + + def setUp(self): + _reset_iai() + + def test_perform_ocr_raises_when_vlm_not_initialized(self): + from app.internal_action_interface import InternalActionInterface + # vlm_interface is None (default state) + with self.assertRaises(RuntimeError) as ctx: + InternalActionInterface.perform_ocr("/some/image.png") + self.assertIn( + "VLMInterface", str(ctx.exception), + "FAIL: RuntimeError message must mention 'VLMInterface' to match " + "existing error message pattern in describe_image/describe_screen." + ) + + def test_understand_video_raises_when_vlm_not_initialized(self): + from app.internal_action_interface import InternalActionInterface + with self.assertRaises(RuntimeError) as ctx: + InternalActionInterface.understand_video("/some/video.mp4") + self.assertIn( + "VLMInterface", str(ctx.exception), + "FAIL: RuntimeError message must mention 'VLMInterface'." 
+ ) + + def tearDown(self): + _reset_iai() + + +# ───────────────────────────────────────────────────────────────── +# SECTION C: perform_ocr — Return Contract +# ───────────────────────────────────────────────────────────────── + +class TestPerformOcrReturnContract(unittest.TestCase): + """ + VERIFY: perform_ocr returns a dict with the correct keys, + correct types, and saves extracted text to AGENT_WORKSPACE_ROOT. + """ + + def setUp(self): + _reset_iai() + self.tmp_workspace = tempfile.mkdtemp() + + def _run_perform_ocr(self, ocr_text="Hello World\nLine 2\nLine 3"): + """Helper: run perform_ocr with a temp image and mocked VLM.""" + mock_vlm = MagicMock() + mock_vlm.describe_image_ocr.return_value = ocr_text + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png") + image_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + result = InternalActionInterface.perform_ocr(image_path) + finally: + os.unlink(image_path) + + return result, mock_vlm + + def test_returns_dict(self): + result, _ = self._run_perform_ocr() + self.assertIsInstance(result, dict, + "FAIL: perform_ocr must return a dict, not a plain string.") + + def test_return_dict_has_required_keys(self): + """Must have: status, summary, file_path, file_saved.""" + result, _ = self._run_perform_ocr() + for key in ("status", "summary", "file_path", "file_saved"): + self.assertIn(key, result, + f"FAIL: perform_ocr return dict is missing key '{key}'.") + + def test_status_is_success_on_happy_path(self): + result, _ = self._run_perform_ocr() + self.assertEqual(result["status"], "success", + "FAIL: status must be 'success' on happy path.") + + def test_file_saved_is_true(self): + result, _ = self._run_perform_ocr() + self.assertTrue(result["file_saved"], + "FAIL: file_saved must be True after successful OCR.") + + def 
test_file_path_exists_on_disk(self): + """The file_path in the result must be a real file that was written.""" + result, _ = self._run_perform_ocr("Invoice #1234\nTotal: $99.99") + self.assertTrue( + os.path.isfile(result["file_path"]), + f"FAIL: file_path '{result['file_path']}' does not exist on disk. " + "perform_ocr must write the extracted text to workspace." + ) + + def test_file_content_matches_ocr_output(self): + """The saved file must contain the raw OCR text exactly as returned by VLM.""" + ocr_text = "CONFIDENTIAL\nProject Alpha\nBudget: $1,000,000" + result, _ = self._run_perform_ocr(ocr_text) + + with open(result["file_path"], "r", encoding="utf-8") as f: + saved_content = f.read() + + self.assertEqual(saved_content, ocr_text, + "FAIL: Saved file content does not match OCR output. " + "The raw text must be written verbatim — no modification.") + + def test_file_saved_to_agent_workspace_root(self): + """The saved file must be inside AGENT_WORKSPACE_ROOT, not a temp dir.""" + result, _ = self._run_perform_ocr() + self.assertTrue( + result["file_path"].startswith(self.tmp_workspace), + f"FAIL: File saved to '{result['file_path']}' but expected " + f"it to be inside AGENT_WORKSPACE_ROOT='{self.tmp_workspace}'. " + "Do not hardcode paths — use AGENT_WORKSPACE_ROOT from app.config." + ) + + def test_file_has_txt_extension(self): + """Output file must be a .txt file (readable by do_chat_with_attachments).""" + result, _ = self._run_perform_ocr() + self.assertTrue( + result["file_path"].endswith(".txt"), + f"FAIL: Output file must have .txt extension. Got: '{result['file_path']}'" + ) + + def test_summary_does_not_contain_full_text(self): + """ + Summary must be a SHORT description, not the full OCR text. + The whole point of saving to file is to keep the agent context lean. + If summary == full text, the TUI flooding problem is not solved. 
+ """ + long_text = "Line\n" * 200 # 200 lines, definitely not a summary + result, _ = self._run_perform_ocr(long_text) + self.assertLess( + len(result["summary"]), len(long_text), + "FAIL: summary contains the full OCR text. It must be a short " + "description (e.g. 'OCR complete: 200 lines, 1000 characters') " + "to prevent context window flooding." + ) + + def test_summary_mentions_line_or_char_count(self): + """Summary must be informative — mention lines or characters extracted.""" + result, _ = self._run_perform_ocr("Hello\nWorld") + summary_lower = result["summary"].lower() + has_count_info = ( + "line" in summary_lower or + "char" in summary_lower or + "word" in summary_lower or + "extracted" in summary_lower + ) + self.assertTrue(has_count_info, + f"FAIL: summary '{result['summary']}' is not informative. " + "It must mention lines/characters extracted so the agent knows what happened.") + + def test_calls_describe_image_ocr_not_describe_image(self): + """ + CRITICAL: Must call vlm_interface.describe_image_ocr(), NOT + vlm_interface.describe_image(). Using describe_image is exactly + the existing bug that Issue #155 was filed for. 
+ """ + mock_vlm = MagicMock() + mock_vlm.describe_image_ocr.return_value = "Some text" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png") + image_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.perform_ocr(image_path) + finally: + os.unlink(image_path) + + mock_vlm.describe_image_ocr.assert_called_once() + mock_vlm.describe_image.assert_not_called() + + def test_user_prompt_forwarded_to_vlm(self): + """Optional user_prompt must be passed through to vlm.describe_image_ocr.""" + mock_vlm = MagicMock() + mock_vlm.describe_image_ocr.return_value = "text" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png") + image_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.perform_ocr(image_path, user_prompt="Focus on prices only.") + finally: + os.unlink(image_path) + + call_kwargs = mock_vlm.describe_image_ocr.call_args + # Check the user_prompt was forwarded (positional or keyword) + all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) + self.assertIn("Focus on prices only.", all_args, + "FAIL: user_prompt was not forwarded to vlm_interface.describe_image_ocr(). 
" + "The OCR method must pass user_prompt through.") + + def tearDown(self): + _reset_iai() + import shutil + shutil.rmtree(self.tmp_workspace, ignore_errors=True) + + +# ───────────────────────────────────────────────────────────────── +# SECTION D: understand_video — Return Contract +# ───────────────────────────────────────────────────────────────── + +class TestUnderstandVideoReturnContract(unittest.TestCase): + """ + VERIFY: understand_video returns a correct dict, saves summary to + workspace, truncates summary to prevent TUI flooding, and + forwards all parameters correctly to vlm_interface. + """ + + def setUp(self): + _reset_iai() + self.tmp_workspace = tempfile.mkdtemp() + + def _run_understand_video(self, summary_text="The video shows a presentation.", query=None, max_frames=8): + mock_vlm = MagicMock() + mock_vlm.describe_video_frames.return_value = summary_text + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4") + video_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + result = InternalActionInterface.understand_video( + video_path, query=query, max_frames=max_frames + ) + finally: + os.unlink(video_path) + + return result, mock_vlm + + def test_returns_dict(self): + result, _ = self._run_understand_video() + self.assertIsInstance(result, dict, + "FAIL: understand_video must return a dict.") + + def test_return_dict_has_required_keys(self): + result, _ = self._run_understand_video() + for key in ("status", "summary", "file_path", "file_saved"): + self.assertIn(key, result, + f"FAIL: understand_video return dict is missing key '{key}'.") + + def test_status_is_success_on_happy_path(self): + result, _ = self._run_understand_video() + self.assertEqual(result["status"], "success") + + def test_file_saved_is_true(self): + result, _ = 
self._run_understand_video() + self.assertTrue(result["file_saved"]) + + def test_file_path_exists_on_disk(self): + result, _ = self._run_understand_video("A meeting recording with 3 participants.") + self.assertTrue( + os.path.isfile(result["file_path"]), + f"FAIL: file_path '{result['file_path']}' does not exist. " + "understand_video must write the full summary to workspace." + ) + + def test_full_summary_saved_to_file(self): + """The full, untruncated summary must be in the saved file.""" + long_summary = "Frame description. " * 100 # deliberately long + result, _ = self._run_understand_video(long_summary) + + with open(result["file_path"], "r", encoding="utf-8") as f: + saved = f.read() + + self.assertEqual(saved, long_summary, + "FAIL: The saved file must contain the FULL summary. " + "Truncation only applies to the return dict's 'summary' key.") + + def test_summary_in_return_dict_is_truncated_for_long_content(self): + """ + For long video summaries, the 'summary' key in the returned dict + must be truncated (<=500 chars + ellipsis) to prevent context flooding. + The full content is in the file — the dict summary is just a preview. + """ + long_summary = "X" * 2000 + result, _ = self._run_understand_video(long_summary) + self.assertLessEqual( + len(result["summary"]), 510, # 500 + len("...") + f"FAIL: summary in return dict is {len(result['summary'])} chars. " + "Must be truncated to ~500 chars to prevent agent context flooding." + ) + + def test_short_summary_not_truncated(self): + """Short summaries (<=500 chars) must be returned as-is without ellipsis.""" + short_summary = "A quick 30-second tutorial on Python loops." 
+        result, _ = self._run_understand_video(short_summary)
+        self.assertEqual(result["summary"], short_summary,
+            "FAIL: Short summary was unexpectedly truncated or modified.")
+
+    def test_file_saved_to_agent_workspace_root(self):
+        result, _ = self._run_understand_video()
+        self.assertTrue(
+            result["file_path"].startswith(self.tmp_workspace),
+            f"FAIL: File saved to wrong location. Expected inside "
+            f"AGENT_WORKSPACE_ROOT='{self.tmp_workspace}'."
+        )
+
+    def test_file_has_txt_extension(self):
+        result, _ = self._run_understand_video()
+        self.assertTrue(result["file_path"].endswith(".txt"),
+            "FAIL: Output file must be .txt")
+
+    def test_video_filename_distinct_from_ocr_filename(self):
+        """
+        Video summary files must have a distinct filename prefix from OCR files
+        to avoid confusion in workspace (e.g. 'video_summary_' vs 'ocr_result_').
+        """
+        result, _ = self._run_understand_video()
+        filename = os.path.basename(result["file_path"])
+        self.assertFalse(
+            filename.startswith("ocr_"),
+            f"FAIL: Video summary file '{filename}' starts with 'ocr_'. "
+            "Video and OCR output files must have distinct prefixes."
+ ) + + def test_calls_describe_video_frames_not_describe_image(self): + """Must delegate to vlm_interface.describe_video_frames(), not describe_image().""" + mock_vlm = MagicMock() + mock_vlm.describe_video_frames.return_value = "summary" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4") + video_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.understand_video(video_path) + finally: + os.unlink(video_path) + + mock_vlm.describe_video_frames.assert_called_once() + mock_vlm.describe_image.assert_not_called() + + def test_query_forwarded_to_vlm(self): + """The query parameter must be forwarded to describe_video_frames.""" + mock_vlm = MagicMock() + mock_vlm.describe_video_frames.return_value = "answer" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4") + video_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.understand_video(video_path, query="What is on slide 3?") + finally: + os.unlink(video_path) + + call_kwargs = mock_vlm.describe_video_frames.call_args + all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) + self.assertIn("What is on slide 3?", all_args, + "FAIL: query not forwarded to describe_video_frames.") + + def test_max_frames_forwarded_to_vlm(self): + """max_frames must be forwarded to describe_video_frames.""" + mock_vlm = MagicMock() + mock_vlm.describe_video_frames.return_value = "summary" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4") + video_path = f.name + + try: + with 
patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.understand_video(video_path, max_frames=12) + finally: + os.unlink(video_path) + + call_kwargs = mock_vlm.describe_video_frames.call_args + all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) + self.assertIn(12, all_args, + "FAIL: max_frames=12 was not forwarded to describe_video_frames.") + + def tearDown(self): + _reset_iai() + import shutil + shutil.rmtree(self.tmp_workspace, ignore_errors=True) + + +# ───────────────────────────────────────────────────────────────── +# SECTION E: Regression — existing methods untouched +# ───────────────────────────────────────────────────────────────── + +class TestRegressionExistingMethods(unittest.TestCase): + """ + REGRESSION GUARD: Ensure describe_image(), describe_screen(), + and initialize() still work exactly as before Step 2. + """ + + def setUp(self): + _reset_iai() + self.tmp_workspace = tempfile.mkdtemp() + + def test_describe_image_still_raises_when_vlm_none(self): + from app.internal_action_interface import InternalActionInterface + with self.assertRaises(RuntimeError): + InternalActionInterface.describe_image("/any/path.png") + + def test_describe_image_still_returns_string(self): + """describe_image must still return str (not dict) — contract unchanged.""" + mock_vlm = MagicMock() + mock_vlm.describe_image.return_value = "A photo of a cat." 
+ _inject_mock_vlm(mock_vlm) + + from app.internal_action_interface import InternalActionInterface + result = InternalActionInterface.describe_image("/fake/path.png") + self.assertIsInstance(result, str, + "REGRESSION: describe_image must still return str, not dict.") + self.assertEqual(result, "A photo of a cat.") + + def test_initialize_still_sets_vlm_interface(self): + """initialize() must still correctly set vlm_interface class attribute.""" + from app.internal_action_interface import InternalActionInterface + + mock_vlm = MagicMock() + mock_llm = MagicMock() + mock_task = MagicMock() + mock_state = MagicMock() + + InternalActionInterface.initialize( + llm_interface=mock_llm, + task_manager=mock_task, + state_manager=mock_state, + vlm_interface=mock_vlm, + ) + + self.assertIs(InternalActionInterface.vlm_interface, mock_vlm, + "REGRESSION: initialize() no longer sets vlm_interface correctly.") + + def test_new_methods_do_not_shadow_describe_image(self): + """ + perform_ocr and understand_video must not accidentally override + or shadow describe_image on the class. 
+ """ + from app.internal_action_interface import InternalActionInterface + # All three must coexist independently + self.assertTrue(hasattr(InternalActionInterface, "describe_image")) + self.assertTrue(hasattr(InternalActionInterface, "perform_ocr")) + self.assertTrue(hasattr(InternalActionInterface, "understand_video")) + + # describe_image must still delegate to vlm.describe_image + mock_vlm = MagicMock() + mock_vlm.describe_image.return_value = "original image description" + _inject_mock_vlm(mock_vlm) + + result = InternalActionInterface.describe_image("/fake.png") + mock_vlm.describe_image.assert_called_once() + # describe_image_ocr must NOT have been called + mock_vlm.describe_image_ocr.assert_not_called() + + def tearDown(self): + _reset_iai() + import shutil + shutil.rmtree(self.tmp_workspace, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_step3_perform_ocr_action.py b/tests/test_step3_perform_ocr_action.py new file mode 100644 index 00000000..31a55f44 --- /dev/null +++ b/tests/test_step3_perform_ocr_action.py @@ -0,0 +1,129 @@ +# tests/test_step3_perform_ocr_action.py +""" +Step 3 Verification: perform_ocr action layer tests. +Tests input validation, simulated mode, schema contract, +and bridge delegation — without making real VLM calls. +""" +import os +import pytest +from unittest.mock import patch, MagicMock + + +# ── Helpers ──────────────────────────────────────────────────────────────── + +def load_action(image_path: str, simulated: bool = False) -> dict: + """Import and invoke the action directly.""" + from app.data.action.perform_ocr import execute + return execute({"image_path": image_path, "simulated_mode": simulated}) + + +# ── 1. 
Input Validation ──────────────────────────────────────────────────── + +class TestInputValidation: + + def test_missing_image_path_key(self): + from app.data.action.perform_ocr import execute + result = execute({}) + assert result["status"] == "error" + assert "image_path" in result["message"].lower() + + def test_empty_image_path_string(self): + result = load_action("") + assert result["status"] == "error" + + def test_nonexistent_file_path(self): + result = load_action("/tmp/does_not_exist_12345.png") + assert result["status"] == "error" + assert "not found" in result["message"].lower() or \ + "does not exist" in result["message"].lower() or \ + result["status"] == "error" + + def test_path_is_directory_not_file(self, tmp_path): + result = load_action(str(tmp_path)) # directory, not a file + assert result["status"] == "error" + + +# ── 2. Simulated Mode ────────────────────────────────────────────────────── + +class TestSimulatedMode: + + def test_simulated_mode_returns_success(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + result = load_action(str(fake_image), simulated=True) + assert result["status"] == "success" + + def test_simulated_mode_makes_no_vlm_call(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr") as mock_ocr: + load_action(str(fake_image), simulated=True) + mock_ocr.assert_not_called() + + def test_simulated_mode_result_is_string(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + result = load_action(str(fake_image), simulated=True) + # In simulated mode, summary or message might be the string + assert isinstance(result.get("summary") or result.get("message"), str) + + +# ── 3. 
Schema Contract ───────────────────────────────────────────────────── + +class TestSchemaContract: + + def test_success_response_has_required_keys(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + mock_return = {"status": "success", "text": "Invoice #1234", "file_path": "/tmp/ocr.txt"} + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + return_value=mock_return): + result = load_action(str(fake_image)) + assert "status" in result + assert result["status"] in ("success", "error") + + def test_error_response_has_message(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + side_effect=RuntimeError("VLM unavailable")): + result = load_action(str(fake_image)) + assert result["status"] == "error" + assert "message" in result + assert len(result["message"]) > 0 + + def test_success_exposes_extracted_text(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + mock_return = {"status": "success", "text": "Hello World", "file_path": "/tmp/ocr.txt"} + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + return_value=mock_return): + result = load_action(str(fake_image)) + # The action must surface the text somewhere — either in result["text"], + # result["result"], or result["message"] + combined = str(result) + assert "Hello World" in combined + + +# ── 4. 
Bridge Delegation ─────────────────────────────────────────────────── + +class TestBridgeDelegation: + + def test_delegates_correct_image_path_to_bridge(self, tmp_path): + fake_image = tmp_path / "receipt.png" + fake_image.write_bytes(b"fake_png_bytes") + mock_return = {"status": "success", "text": "some text", "file_path": "/tmp/x.txt"} + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + return_value=mock_return) as mock_bridge: + load_action(str(fake_image)) + called_path = mock_bridge.call_args[0][0] + assert called_path == str(fake_image) + + def test_bridge_vlm_not_initialized_returns_error(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + side_effect=RuntimeError("InternalActionInterface not initialized with VLMInterface.")): + result = load_action(str(fake_image)) + assert result["status"] == "error" + assert "message" in result diff --git a/tests/test_step4_understand_video_action.py b/tests/test_step4_understand_video_action.py new file mode 100644 index 00000000..619dacc0 --- /dev/null +++ b/tests/test_step4_understand_video_action.py @@ -0,0 +1,116 @@ +# tests/test_step4_understand_video_action.py + +import pytest +from unittest.mock import patch + +def load_action(video_path: str, query: str = "", simulated: bool = False) -> dict: + from app.data.action.understand_video import execute + return execute({ + "video_path": video_path, + "query": query, + "simulated_mode": simulated, + }) + + +class TestInputValidation: + + def test_missing_video_path_key(self): + from app.data.action.understand_video import execute + result = execute({}) + assert result["status"] == "error" + assert "video_path" in result["message"].lower() + + def test_empty_video_path_string(self): + result = load_action("") + assert result["status"] == "error" + + def test_nonexistent_file_path(self): + result = 
load_action("/tmp/does_not_exist_98765.mp4") + assert result["status"] == "error" + + def test_path_is_directory_not_file(self, tmp_path): + result = load_action(str(tmp_path)) + assert result["status"] == "error" + + +class TestSimulatedMode: + + def test_simulated_mode_returns_success(self, tmp_path): + fake_video = tmp_path / "test.mp4" + fake_video.write_bytes(b"fake_video_bytes") + result = load_action(str(fake_video), simulated=True) + assert result["status"] == "success" + + def test_simulated_mode_makes_no_vlm_call(self, tmp_path): + fake_video = tmp_path / "test.mp4" + fake_video.write_bytes(b"fake_video_bytes") + with patch("app.internal_action_interface.InternalActionInterface.understand_video") as mock_bridge: + load_action(str(fake_video), simulated=True) + mock_bridge.assert_not_called() + + +class TestSchemaContract: + + def test_success_response_has_required_keys(self, tmp_path): + fake_video = tmp_path / "clip.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + mock_return = { + "status": "success", + "summary": "A person walks into a room.", + "preview": "A person walks...", + "file_path": "/tmp/video_summary.txt", + } + with patch("app.internal_action_interface.InternalActionInterface.understand_video", + return_value=mock_return): + result = load_action(str(fake_video), query="What happens?") + + assert result["status"] == "success" + for key in ("summary", "file_path"): + assert key in result + + def test_error_response_has_message(self, tmp_path): + fake_video = tmp_path / "clip.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + with patch("app.internal_action_interface.InternalActionInterface.understand_video", + side_effect=RuntimeError("VLM unavailable")): + result = load_action(str(fake_video)) + + assert result["status"] == "error" + assert "message" in result + assert len(result["message"]) > 0 + + +class TestBridgeDelegation: + + def test_delegates_correct_video_path_and_query(self, tmp_path): + fake_video = tmp_path / 
"scene.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + mock_return = { + "status": "success", + "summary": "Some summary", + "preview": "Some...", + "file_path": "/tmp/video_summary.txt", + } + with patch("app.internal_action_interface.InternalActionInterface.understand_video", + return_value=mock_return) as mock_bridge: + load_action(str(fake_video), query="Who is present?") + + # Verify bridge call arguments + # In some versions of mock, call_args[0] is positional args + called_args = mock_bridge.call_args[0] + assert called_args[0] == str(fake_video) + assert mock_bridge.call_args[1].get('query') == "Who is present?" or called_args[1] == "Who is present?" + + def test_bridge_vlm_not_initialized_returns_error(self, tmp_path): + fake_video = tmp_path / "scene.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + with patch("app.internal_action_interface.InternalActionInterface.understand_video", + side_effect=RuntimeError("InternalActionInterface not initialized with VLMInterface.")): + result = load_action(str(fake_video)) + + assert result["status"] == "error" + assert "message" in result From fa0284eb092495e3c171d6ad5cb1e4f40196927e Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Thu, 16 Apr 2026 18:05:29 +0530 Subject: [PATCH 02/81] improvement: use Gemini native video API as primary path in understand_video, OpenCV as fallback --- app/data/action/understand_video.py | 43 ++++++++++++++- tests/test_step4_understand_video_action.py | 61 +++++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index d40b4dfb..e4c5c77d 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -2,9 +2,10 @@ @action( name="understand_video", - description="Analyses a video file by sampling keyframes and generating a narrative summary using a Vision Language Model. 
Use when the user shares a video and wants to know what happens in it, extract visible text, or answer a specific question about video content.", + description="Uses Gemini 1.5 Pro for native video understanding when a Google API key is configured. Falls back to keyframe extraction via OpenCV if no Google API key is available.", mode="CLI", action_sets=["document_processing, image"], + requirement=["google-generativeai"], input_schema={ "video_path": { "type": "string", @@ -79,6 +80,46 @@ def understand_video(input_data: dict) -> dict: if not os.path.isfile(video_path): return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'File not found.'} + from app.config import get_api_key + api_key = get_api_key('gemini') + + if api_key: + try: + import google.generativeai as genai + genai.configure(api_key=api_key) + import time + from datetime import datetime + from app.config import AGENT_WORKSPACE_ROOT + + video_file = genai.upload_file(path=video_path) + + while video_file.state.name == "PROCESSING": + time.sleep(2) + video_file = genai.get_file(video_file.name) + + model = genai.GenerativeModel("gemini-1.5-pro") + prompt = query if query else "Understand and describe the contents of this video." + response = model.generate_content([video_file, prompt]) + + genai.delete_file(video_file.name) + + full_text = response.text + ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + out_path = os.path.join(AGENT_WORKSPACE_ROOT, f"video_summary_{ts}.txt") + with open(out_path, "w", encoding="utf-8") as f: + f.write(full_text) + + return { + 'status': 'success', + 'summary': full_text[:500] + ("..." 
if len(full_text) > 500 else ""), + 'file_path': out_path, + 'file_saved': True, + 'message': '' + } + except Exception as e: + # Fall through to fallback path if Gemini native path fails + pass + try: import app.internal_action_interface as iai result = iai.InternalActionInterface.understand_video(video_path, query=query, max_frames=max_frames) diff --git a/tests/test_step4_understand_video_action.py b/tests/test_step4_understand_video_action.py index 619dacc0..77883701 100644 --- a/tests/test_step4_understand_video_action.py +++ b/tests/test_step4_understand_video_action.py @@ -114,3 +114,64 @@ def test_bridge_vlm_not_initialized_returns_error(self, tmp_path): assert result["status"] == "error" assert "message" in result + + +class TestPrimaryGeminiPath: + + @patch("app.config.get_api_key") + @patch("google.generativeai.upload_file") + @patch("google.generativeai.GenerativeModel") + @patch("google.generativeai.delete_file") + def test_gemini_path_success(self, mock_delete, mock_generative_model, mock_upload, mock_get_api_key, tmp_path): + from unittest.mock import MagicMock + mock_get_api_key.return_value = "fake_google_key" + + mock_file = MagicMock() + mock_file.name = "fake_video_name" + mock_file.state.name = "ACTIVE" + mock_upload.return_value = mock_file + + mock_model_instance = MagicMock() + mock_response = MagicMock() + mock_response.text = "This is a native Gemini summary of the video. 
" * 20 + mock_model_instance.generate_content.return_value = mock_response + mock_generative_model.return_value = mock_model_instance + + fake_video = tmp_path / "gemini_clip.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + with patch("app.config.AGENT_WORKSPACE_ROOT", str(tmp_path)): + result = load_action(str(fake_video), query="What happens?") + + assert result["status"] == "success" + assert "native Gemini summary" in result["summary"] + assert result["file_saved"] is True + + mock_upload.assert_called_once() + mock_model_instance.generate_content.assert_called_once() + mock_delete.assert_called_once_with(mock_file.name) + + @patch("app.config.get_api_key") + @patch("app.internal_action_interface.InternalActionInterface.understand_video") + def test_fallback_path_triggered(self, mock_bridge, mock_get_api_key, tmp_path): + mock_get_api_key.return_value = None + + mock_return = { + "status": "success", + "summary": "Fallback summary", + "file_path": "/tmp/fallback.txt", + "file_saved": True, + "message": "" + } + mock_bridge.return_value = mock_return + + fake_video = tmp_path / "fallback_clip.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + result = load_action(str(fake_video), query="Fallback query") + + assert result["status"] == "success" + assert result["summary"] == "Fallback summary" + mock_bridge.assert_called_once() + called_args = mock_bridge.call_args[0] + assert called_args[0] == str(fake_video) From 8050b7c3dd09d8e8a27a3b77b20b8aefa1a1772a Mon Sep 17 00:00:00 2001 From: zfoong Date: Thu, 16 Apr 2026 21:55:22 +0900 Subject: [PATCH 03/81] improvement:CI and readme update --- .github/workflows/ci.yml | 49 ++++++++++++++++++++++++++++++++++++++++ CONTRIBUTING.md | 10 ++++---- README.cn.md | 13 ++++++----- README.ja.md | 13 ++++++----- README.md | 15 ++++++------ 5 files changed, 76 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 
index 00000000..7f61cae4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,49 @@ +name: CI + +on: + pull_request: + push: + branches: + - main + - dev + - "V*" + +jobs: + lint: + name: Lint (ruff) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: pip + + - name: Install ruff + run: pip install ruff + + - name: Check formatting + run: ruff format --check . + + - name: Run ruff check + run: ruff check . + + smoke: + name: Smoke (syntax + imports) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: pip + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Byte-compile source tree + run: python -m compileall -q app agent_core agents decorators skills diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fd958734..b45392e9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,7 +9,7 @@ To ensure contributor feels welcome, we have this guide to help you get started ## 🌟 Links - [Discord Community](https://discord.gg/W8jdMKdE) -- [Issue Tracker](https://github.com/zfoong/CraftBot/issues) +- [Issue Tracker](https://github.com/CraftOS-dev/CraftBot/issues) ## 1. 🚀 Ways to Contribute @@ -24,7 +24,7 @@ Here are all the things you to contribute to the community. ## 📫 There are several ways to collaborate with the team and community: ### GitHub Collaboration -- [Open an issue](https://github.com/zfoong/CraftBot/issues) for bug reports, feature requests, or discussions +- [Open an issue](https://github.com/CraftOS-dev/CraftBot/issues) for bug reports, feature requests, or discussions - Submit pull requests to contribute code or documentation - Join ongoing discussions in existing issues and PRs @@ -45,7 +45,7 @@ For faster responses, consider using our Discord channel where the whole communi ### Fork and Clone -1. 
Fork the [**CraftBot**](https://github.com/zfoong/CraftBot) repository +1. Fork the [**CraftBot**](https://github.com/CraftOS-dev/CraftBot) repository 2. Clone your fork: ```shell git clone https://github.com//CraftBot.git @@ -85,7 +85,7 @@ git push origin your-branch-name ``` 2. Create a Pull Request: - - Go to the [**CraftBot** repository](https://github.com/zfoong/CraftBot) + - Go to the [**CraftBot** repository](https://github.com/CraftOS-dev/CraftBot) - Click "Compare & Pull Request" and open a PR against dev branch - Fill in the PR template with details about your changes @@ -99,7 +99,7 @@ git push origin your-branch-name ## 6. 📫 To Get Help -- Open an [issue](https://github.com/zfoong/CraftBot) +- Open an [issue](https://github.com/CraftOS-dev/CraftBot) - Join our Discord community Thank you for contributing to **CraftBot**! 🌟 \ No newline at end of file diff --git a/README.cn.md b/README.cn.md index 96630f15..cde213ea 100644 --- a/README.cn.md +++ b/README.cn.md @@ -6,13 +6,14 @@
Windows + macOS Linux - - GitHub Repo stars + + GitHub Repo stars - License + License Discord @@ -52,7 +53,7 @@ CraftBot 静候你的指令,现在就部署属于你的 CraftBot 吧。 - **外部工具集成** — 连接 Google Workspace、Slack、Notion、Zoom、LinkedIn、Discord 和 Telegram(更多即将推出!),支持嵌入式凭据和 OAuth。 - **MCP** — 模型上下文协议(Model Context Protocol)集成,通过外部工具和服务扩展代理能力。 - **技能系统** — 可扩展的技能框架,内置任务规划、研究、代码审查、Git 操作等技能。 -- **跨平台** — 完整支持 Windows 和 Linux,具有平台特定代码变体和 Docker 容器化。 +- **跨平台** — 完整支持 Windows、macOS 和 Linux,具有平台特定代码变体和 Docker 容器化。 > [!IMPORTANT] > **关于 GUI 模式的说明:** GUI 模式仍处于实验阶段。代理切换到 GUI 模式时可能会遇到一些问题。我们正在积极改进此功能。 @@ -78,7 +79,7 @@ CraftBot 静候你的指令,现在就部署属于你的 CraftBot 吧。 ```bash # 克隆仓库 -git clone https://github.com/zfoong/CraftBot.git +git clone https://github.com/CraftOS-dev/CraftBot.git cd CraftBot # 安装依赖 @@ -424,7 +425,7 @@ GUI 操作(鼠标/键盘事件、截图)需要 X11 服务器。你可以连 ## 🤝 如何贡献 -欢迎各种建议与反馈!你可以联系 [@zfoong](https://github.com/zfoong),邮箱为 thamyikfoong(at)craftos.net。我们目前尚未配置检查流程,因此无法接受直接提交贡献,但非常感谢你的建议与反馈。 +欢迎提交 PR!请参阅 [CONTRIBUTING.md](CONTRIBUTING.md) 了解工作流程(fork → 从 `dev` 分支新建分支 → 提交 PR)。所有 Pull Request 都会自动运行 lint + 烟雾测试 CI。如需快速沟通,可加入我们的 [Discord](https://discord.gg/ZN9YHc37HG) 或发送邮件至 thamyikfoong(at)craftos.net。 ## 🧾 许可证 diff --git a/README.ja.md b/README.ja.md index c5f3cd10..8d77cbb4 100644 --- a/README.ja.md +++ b/README.ja.md @@ -6,13 +6,14 @@
Windows + macOS Linux - - GitHub Repo stars + + GitHub Repo stars - License + License Discord @@ -52,7 +53,7 @@ CraftBotはあなたの命令を待っています。今すぐあなた専用の - **外部ツール統合** — 埋め込みクレデンシャルとOAuthサポートにより、Google Workspace、Slack、Notion、Zoom、LinkedIn、Discord、Telegramに接続(今後さらに追加予定!)。 - **MCP** — 外部ツールやサービスでエージェント機能を拡張するためのModel Context Protocol統合。 - **スキル** — タスク計画、リサーチ、コードレビュー、Git操作などの組み込みスキルを含む拡張可能なスキルフレームワーク。 -- **クロスプラットフォーム** — プラットフォーム固有のコードバリアントとDockerコンテナ化によるWindowsとLinuxの完全サポート。 +- **クロスプラットフォーム** — プラットフォーム固有のコードバリアントとDockerコンテナ化によるWindows、macOS、Linuxの完全サポート。 > [!IMPORTANT] > **GUIモードに関する注意:** GUIモードはまだ実験段階です。エージェントがGUIモードに切り替える際に問題が発生する可能性があります。この機能の改善に積極的に取り組んでいます。 @@ -78,7 +79,7 @@ CraftBotはあなたの命令を待っています。今すぐあなた専用の ```bash # リポジトリをクローン -git clone https://github.com/zfoong/CraftBot.git +git clone https://github.com/CraftOS-dev/CraftBot.git cd CraftBot # 依存関係をインストール @@ -424,7 +425,7 @@ GUIアクション(マウス/キーボードイベント、スクリーンシ ## 🤝 貢献方法 -貢献と提案を歓迎します![@zfoong](https://github.com/zfoong) @ thamyikfoong(at)craftos.net までご連絡ください。現在、チェック機能を設定していないため、直接的な貢献は受け付けられませんが、提案やフィードバックは大変ありがたく思います。 +プルリクエストを歓迎します!ワークフロー(fork → `dev` ブランチから分岐 → PR)については [CONTRIBUTING.md](CONTRIBUTING.md) をご覧ください。すべてのプルリクエストは lint + スモークテスト CI で自動的に検証されます。質問や素早いやり取りをご希望の場合は、[Discord](https://discord.gg/ZN9YHc37HG) に参加するか、thamyikfoong(at)craftos.net までメールしてください。 ## 🧾 ライセンス diff --git a/README.md b/README.md index 38b74939..666b19f9 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,14 @@
Windows + macOS Linux - - - GitHub Repo stars + + + GitHub Repo stars - License + License Discord @@ -52,7 +53,7 @@ CraftBot awaits your orders. Set up your own CraftBot now. - **External Tools Integration** — Connect to Google Workspace, Slack, Notion, Zoom, LinkedIn, Discord, and Telegram (more to come!) with embedded credentials and OAuth support. - **MCP** — Model Context Protocol integration for extending agent capabilities with external tools and services. - **Skills** — Extensible skill framework with built-in skills for task planning, research, code review, git operations, and more. -- **Cross-Platform** — Full support for Windows and Linux with platform-specific code variants and Docker containerization. +- **Cross-Platform** — Full support for Windows, macOS, and Linux with platform-specific code variants and Docker containerization. > [!IMPORTANT] > **Note for GUI mode:** The GUI mode is still in experimental phase. This means you may encounter issues when the agent switches to GUI mode. We are actively improving this feature. @@ -78,7 +79,7 @@ CraftBot awaits your orders. Set up your own CraftBot now. ```bash # Clone the repository -git clone https://github.com/zfoong/CraftBot.git +git clone https://github.com/CraftOS-dev/CraftBot.git cd CraftBot # Install dependencies @@ -469,7 +470,7 @@ By default the image uses Python 3.10 and bundles the Python dependencies from ` ## 🤝 How to Contribute -Contributions and suggestions are welcome! You can contact [@zfoong](https://github.com/zfoong) @ thamyikfoong(at)craftos.net. We currently don't have checks set up, so we can't allow direct contributions but we appreciate any suggestions and feedback. +PRs are welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for the workflow (fork → branch from `dev` → PR). All pull requests run through lint + smoke-test CI automatically. For questions or a faster conversation, join us on [Discord](https://discord.gg/ZN9YHc37HG) or email thamyikfoong(at)craftos.net. 
## 🧾 License From 6915894d4160020c5f093deb6f19f368d5a9bb4c Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Thu, 16 Apr 2026 22:25:36 +0530 Subject: [PATCH 04/81] fix(vlm): remove response_format json_object from byteplus, re-raise exceptions in describe_image_bytes --- agent_core/core/impl/vlm/interface.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index 455de4af..b4c7aed4 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -259,7 +259,7 @@ def describe_image_bytes( return cleaned except Exception as e: logger.error(f"[ERROR] {e}") - return "" + raise async def generate_response_async( self, @@ -624,7 +624,6 @@ def _byteplus_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str "messages": messages, "temperature": self.temperature, "max_tokens": 2048, - "response_format": {"type": "json_object"}, } headers = { "Content-Type": "application/json", From 125cff4368260ed302c3faa6783b85f6f3a59f7e Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Thu, 16 Apr 2026 22:25:57 +0530 Subject: [PATCH 05/81] fix(actions): split action_sets string into proper list in perform_ocr and understand_video --- app/data/action/perform_ocr.py | 2 +- app/data/action/understand_video.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/data/action/perform_ocr.py b/app/data/action/perform_ocr.py index 3c1d01d9..85c2a5d6 100644 --- a/app/data/action/perform_ocr.py +++ b/app/data/action/perform_ocr.py @@ -4,7 +4,7 @@ name="perform_ocr", description="Extracts all text from an image using OCR via a Vision Language Model. Use this when the user wants to read text from a screenshot, scanned document, photo of a receipt, whiteboard, sign, or any image containing text. 
Returns extracted text saved to a file in workspace.", mode="CLI", - action_sets=["document_processing, image"], + action_sets=["document_processing", "image"], input_schema={ "image_path": { "type": "string", diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index e4c5c77d..8c280419 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -4,7 +4,7 @@ name="understand_video", description="Uses Gemini 1.5 Pro for native video understanding when a Google API key is configured. Falls back to keyframe extraction via OpenCV if no Google API key is available.", mode="CLI", - action_sets=["document_processing, image"], + action_sets=["document_processing", "image"], requirement=["google-generativeai"], input_schema={ "video_path": { From 247ee92824ffc58a96b679e32c251764bdf99486 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Thu, 16 Apr 2026 22:56:50 +0530 Subject: [PATCH 06/81] fix: wire independent VLM provider/model/key resolution and add availability guard --- app/agent_base.py | 19 +++++++++++++---- app/data/action/describe_image.py | 34 ++++++++++++++++++++++++++++--- app/data/action/generate_image.py | 2 +- app/main.py | 17 ++++++++++------ 4 files changed, 58 insertions(+), 14 deletions(-) diff --git a/app/agent_base.py b/app/agent_base.py index 8ee53288..8158dfcd 100644 --- a/app/agent_base.py +++ b/app/agent_base.py @@ -45,6 +45,8 @@ AGENT_FILE_SYSTEM_TEMPLATE_PATH, AGENT_MEMORY_CHROMA_PATH, PROCESS_MEMORY_AT_STARTUP, + get_api_key, + get_base_url, ) from app.internal_action_interface import InternalActionInterface @@ -124,6 +126,8 @@ def __init__( llm_api_key: str | None = None, llm_base_url: str | None = None, llm_model: str | None = None, + vlm_provider: str | None = None, + vlm_model: str | None = None, deferred_init: bool = False, ) -> None: """ @@ -139,6 +143,8 @@ def __init__( llm_api_key: API key for the LLM provider. llm_base_url: Base URL for the LLM provider (optional). 
llm_model: Model name override (None = use registry default). + vlm_provider: Provider name for VLM (defaults to llm_provider if None). + vlm_model: VLM model name override (None = use registry default). deferred_init: If True, allow LLM/VLM initialization to be deferred until API key is configured (useful for first-time setup). """ @@ -156,11 +162,16 @@ def __init__( base_url=llm_base_url, deferred=deferred_init, ) + # VLM uses its own provider/model settings, falling back to LLM values + _vlm_provider = vlm_provider or llm_provider + _vlm_api_key = get_api_key(_vlm_provider) if vlm_provider else llm_api_key + _vlm_base_url = get_base_url(_vlm_provider) if vlm_provider else llm_base_url + self.vlm = VLMInterface( - provider=llm_provider, - model=llm_model, - api_key=llm_api_key, - base_url=llm_base_url, + provider=_vlm_provider, + model=vlm_model, + api_key=_vlm_api_key, + base_url=_vlm_base_url, deferred=deferred_init, ) diff --git a/app/data/action/describe_image.py b/app/data/action/describe_image.py index abccca24..8e66ae7a 100644 --- a/app/data/action/describe_image.py +++ b/app/data/action/describe_image.py @@ -4,7 +4,7 @@ name="describe_image", description="Uses a Visual Language Model to analyse an image and return a detailed, markdown-ready description. IMPORTANT: Always provide a prompt describing what to look for or describe in the image.", mode="CLI", - action_sets=["document_processing, image"], + action_sets=["document_processing", "image"], input_schema={ "image_path": { "type": "string", @@ -48,9 +48,36 @@ def view_image(input_data: dict) -> dict: prompt = str(input_data.get('prompt', '')).strip() or "Describe the content of this image in detail." 
if simulated_mode: - # Return mock result for testing return {'status': 'success', 'description': 'A simulated image description showing various objects and colors.', 'message': ''} + # ── VLM availability guard ────────────────────────────────────────── + import app.internal_action_interface as iai + from agent_core.core.models.model_registry import MODEL_REGISTRY + from agent_core.core.models.types import InterfaceType + from app.config import get_vlm_provider + + vlm = iai.InternalActionInterface.vlm_interface + current_provider = get_vlm_provider() + registry_vlm = MODEL_REGISTRY.get(current_provider, {}).get(InterfaceType.VLM) + + if vlm is None or not registry_vlm: + return { + 'status': 'error', + 'description': '', + 'message': ( + f"The current VLM provider '{current_provider}' does not support vision/image analysis. " + "Please inform the user and suggest switching to a provider that supports VLM.\n\n" + "Providers with VLM support: openai, anthropic, gemini, byteplus.\n\n" + "To switch provider, edit 'app/config/settings.json' and update:\n" + ' "vlm_provider": "" (e.g. "anthropic")\n' + ' "vlm_model": "" (e.g. "claude-sonnet-4-6" for anthropic)\n\n' + "Make sure the corresponding API key is configured under 'api_keys' in the same file. " + "If no API key is set, ask the user to provide one. " + "The system will automatically detect the config change and reload." 
+ ), + } + # ─────────────────────────────────────────────────────────────────── + if not image_path: return {'status': 'error', 'description': '', 'message': 'image_path is required.'} @@ -58,8 +85,9 @@ def view_image(input_data: dict) -> dict: return {'status': 'error', 'description': '', 'message': 'File not found.'} try: - import app.internal_action_interface as iai description = iai.InternalActionInterface.describe_image(image_path, prompt) + if not description: + return {'status': 'error', 'description': '', 'message': 'VLM returned an empty description.'} return {'status': 'success', 'description': description, 'message': ''} except Exception as e: return {'status': 'error', 'description': '', 'message': str(e)} \ No newline at end of file diff --git a/app/data/action/generate_image.py b/app/data/action/generate_image.py index fde5dfae..751a2d5e 100644 --- a/app/data/action/generate_image.py +++ b/app/data/action/generate_image.py @@ -10,7 +10,7 @@ - TIP: When generating multiple images for the same project or related work, use 'reference_images' parameter with previously generated images to maintain consistent style across all outputs""", default=True, mode="CLI", - action_sets=["content_creation, image, document_processing"], + action_sets=["content_creation", "image", "document_processing"], input_schema={ "prompt": { "type": "string", diff --git a/app/main.py b/app/main.py index ce4e5dd4..418f3dd1 100644 --- a/app/main.py +++ b/app/main.py @@ -67,7 +67,7 @@ def _suppress_console_logging_early() -> None: ConfigRegistry.register_workspace_root(".") # Import settings reader (reads directly from settings.json) -from app.config import get_llm_provider, get_api_key, get_base_url, get_llm_model +from app.config import get_llm_provider, get_vlm_provider, get_api_key, get_base_url, get_llm_model, get_vlm_model from app.agent_base import AgentBase @@ -110,12 +110,12 @@ def _parse_cli_args() -> dict: return vars(args) -def _initial_settings() -> tuple[str, str, 
str, bool]: +def _initial_settings() -> tuple: """Determine initial provider, API key, and base URL from settings.json. Returns: - Tuple of (provider, api_key, base_url, has_valid_key) where has_valid_key - indicates if a working API key was found. + Tuple of (provider, api_key, base_url, model, vlm_provider, vlm_model, has_valid_key) + where has_valid_key indicates if a working API key was found. """ # Read directly from settings.json provider = get_llm_provider() @@ -126,7 +126,10 @@ def _initial_settings() -> tuple[str, str, str, bool]: # Remote (Ollama) doesn't require API key has_key = bool(api_key) or provider == "remote" - return provider, api_key, base_url, model, has_key + vlm_prov = get_vlm_provider() # falls back to llm_provider if not set + vlm_mod = get_vlm_model() # falls back to registry default if None + + return provider, api_key, base_url, model, vlm_prov, vlm_mod, has_key async def main_async() -> None: @@ -136,7 +139,7 @@ async def main_async() -> None: browser_mode = cli_args.get("browser", False) # Get settings from settings.json - provider, api_key, base_url, model, has_valid_key = _initial_settings() + provider, api_key, base_url, model, vlm_prov, vlm_mod, has_valid_key = _initial_settings() # CLI args override settings.json if provided if cli_args.get("provider"): @@ -159,6 +162,8 @@ async def main_async() -> None: llm_api_key=api_key, llm_base_url=base_url, llm_model=model, + vlm_provider=vlm_prov, + vlm_model=vlm_mod, deferred_init=not has_valid_key, ) From f00ae32d15b3c8c17210cfb2c99f24bcf5fa8850 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Fri, 17 Apr 2026 16:40:04 +0530 Subject: [PATCH 07/81] refactor(vlm): unify multimodal, deduplicate OCR path, dynamic video model - Merge generate_multimodal_multi_image into generate_multimodal (image_bytes_list param) - Add json_mode param to describe_image_bytes; describe_image_ocr now a thin wrapper - understand_video pulls model from get_vlm_model() with gemini-1.5-pro fallback - Add test suites: 
gemini_client_multimodal, vlm_json_mode, ocr_wrapper, understand_video_model --- agent_core/core/impl/vlm/interface.py | 33 +++---- agent_core/core/llm/google_gemini_client.py | 99 +++++---------------- app/data/action/understand_video.py | 7 +- tests/test_gemini_client_multimodal.py | 49 ++++++++++ tests/test_step1_vlm_interface.py | 21 ++--- tests/test_understand_video_model.py | 49 ++++++++++ tests/test_vlm_interface_json_mode.py | 53 +++++++++++ tests/test_vlm_ocr_wrapper.py | 52 +++++++++++ 8 files changed, 255 insertions(+), 108 deletions(-) create mode 100644 tests/test_gemini_client_multimodal.py create mode 100644 tests/test_understand_video_model.py create mode 100644 tests/test_vlm_interface_json_mode.py create mode 100644 tests/test_vlm_ocr_wrapper.py diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index b4c7aed4..1ddd401b 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -217,6 +217,7 @@ def describe_image_bytes( system_prompt: str | None = None, user_prompt: str | None = "Describe this image in detail.", log_response: bool = True, + json_mode: bool = True, ) -> str: """Describe an image from raw bytes using the VLM. 
@@ -234,7 +235,10 @@ def describe_image_bytes( logger.info(f"[LLM SEND] system={system_prompt} | user={user_prompt}") if self.provider in ("openai", "minimax", "deepseek", "moonshot", "grok"): - response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt) + if json_mode: + response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt) + else: + response = self._openai_describe_bytes_plain(image_bytes, system_prompt, user_prompt) elif self.provider == "remote": response = self._ollama_describe_bytes(image_bytes, system_prompt, user_prompt) elif self.provider == "gemini": @@ -311,24 +315,13 @@ def describe_image_ocr( logger.info(f"[LLM SEND] OCR request | path={image_path}") - if self.provider in ("openai", "minimax", "deepseek", "moonshot", "grok"): - response = self._openai_describe_bytes_plain(image_bytes, system_prompt, effective_user) - elif self.provider == "remote": - response = self._ollama_describe_bytes(image_bytes, system_prompt, effective_user) - elif self.provider == "gemini": - response = self._gemini_describe_bytes(image_bytes, system_prompt, effective_user) - elif self.provider == "byteplus": - response = self._byteplus_describe_bytes(image_bytes, system_prompt, effective_user) - elif self.provider == "anthropic": - response = self._anthropic_describe_bytes(image_bytes, system_prompt, effective_user) - else: - raise RuntimeError(f"Unknown provider {self.provider!r}") - - cleaned = re.sub(self._CODE_BLOCK_RE, "", response.get("content", "").strip()) - - tokens_used = response.get("tokens_used", 0) - if tokens_used: - self._set_token_count(self._get_token_count() + tokens_used) + cleaned = self.describe_image_bytes( + image_bytes, + system_prompt=system_prompt, + user_prompt=effective_user, + log_response=False, # Logged below + json_mode=False, + ) logger.info(f"[LLM RECV OCR] {cleaned[:120]}...") return cleaned @@ -451,7 +444,7 @@ def _gemini_describe_video_frames( self, frame_bytes_list: list[bytes], sys: str | 
None, usr: str ) -> str: """Gemini-specific multi-image frame analysis in a single API call.""" - result = self._gemini_client.generate_multimodal_multi_image( + result = self._gemini_client.generate_multimodal( self.model, text=usr, image_bytes_list=frame_bytes_list, diff --git a/agent_core/core/llm/google_gemini_client.py b/agent_core/core/llm/google_gemini_client.py index 3cbffe44..36ae2f21 100644 --- a/agent_core/core/llm/google_gemini_client.py +++ b/agent_core/core/llm/google_gemini_client.py @@ -168,12 +168,16 @@ def generate_multimodal( model: str, *, text: str, - image_bytes: bytes, + image_bytes: Optional[bytes] = None, + image_bytes_list: Optional[List[bytes]] = None, system_prompt: Optional[str] = None, temperature: Optional[float] = None, json_mode: bool = False, ) -> Dict[str, Any]: - """Generate text from a prompt that also contains an inline image. + """Generate text from a prompt that contains one or more inline images. + + Normalises both single-image and multi-image inputs into a consistent + request format for the Gemini API. 
Returns a dict containing: - tokens_used: Total tokens consumed @@ -185,7 +189,8 @@ def generate_multimodal( Args: model: Model identifier text: The text prompt - image_bytes: PNG image data + image_bytes: Single PNG image data (for backward compatibility) + image_bytes_list: List of image data (PNG/JPEG) system_prompt: Optional system instruction temperature: Sampling temperature json_mode: If True, enforce JSON output format @@ -193,80 +198,22 @@ def generate_multimodal( Returns: Dict with generation results and token counts """ - inline_data = { - "mimeType": "image/png", - "data": base64.b64encode(image_bytes).decode("utf-8"), - } - - parts: List[Dict[str, Any]] = [{"text": text}, {"inlineData": inline_data}] - contents = [{"role": "user", "parts": parts}] - - payload: Dict[str, Any] = {"contents": contents} - if system_prompt: - payload["systemInstruction"] = { - "parts": [{"text": system_prompt}], - } - - generation_config: Dict[str, Any] = {} - if temperature is not None: - generation_config["temperature"] = temperature - if json_mode: - generation_config["responseMimeType"] = "application/json" - if generation_config: - payload["generationConfig"] = generation_config - - response = self._post_json( - f"{_normalise_model_name(model)}:generateContent", payload - ) - - # Extract token usage from usageMetadata - usage_metadata = response.get("usageMetadata", {}) - total_tokens = usage_metadata.get("totalTokenCount", 0) - prompt_tokens = usage_metadata.get("promptTokenCount", 0) - completion_tokens = usage_metadata.get("candidatesTokenCount", 0) - cached_tokens = usage_metadata.get("cachedContentTokenCount", 0) - - content = self._extract_text(response) + # Normalise: single image wraps into list; list takes priority if both provided + images = image_bytes_list if image_bytes_list is not None else ([image_bytes] if image_bytes else []) + if not images: + raise ValueError("At least one of `image_bytes` or `image_bytes_list` must be provided.") - return { - 
"tokens_used": total_tokens, - "content": content, - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "cached_tokens": cached_tokens, - } - - def generate_multimodal_multi_image( - self, - model: str, - *, - text: str, - image_bytes_list: List[bytes], - system_prompt: Optional[str] = None, - temperature: Optional[float] = None, - json_mode: bool = False, - ) -> Dict[str, Any]: - """Generate text from a prompt that contains multiple inline images. - - Args: - model: Model identifier - text: The text prompt - image_bytes_list: List of PNG/JPEG image data - system_prompt: Optional system instruction - temperature: Sampling temperature - json_mode: If True, enforce JSON output format - - Returns: - Dict with generation results and token counts - """ parts: List[Dict[str, Any]] = [{"text": text}] - - for image_bytes in image_bytes_list: - inline_data = { - "mimeType": "image/jpeg", - "data": base64.b64encode(image_bytes).decode("utf-8"), - } - parts.append({"inlineData": inline_data}) + for img in images: + # Preserve existing mime-type logic: single-image callers stay PNG index, + # multi-image callers (video frames) use JPEG. + mime = "image/jpeg" if image_bytes_list is not None else "image/png" + parts.append({ + "inlineData": { + "mimeType": mime, + "data": base64.b64encode(img).decode("utf-8"), + } + }) contents = [{"role": "user", "parts": parts}] @@ -305,6 +252,8 @@ def generate_multimodal_multi_image( "cached_tokens": cached_tokens, } + + def embed_text(self, model: str, *, text: str) -> List[float]: """Fetch an embedding vector for the supplied text. diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index 8c280419..12a19804 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -2,7 +2,7 @@ @action( name="understand_video", - description="Uses Gemini 1.5 Pro for native video understanding when a Google API key is configured. 
Falls back to keyframe extraction via OpenCV if no Google API key is available.", + description="Uses the configured VLM model (default: Gemini 1.5 Pro) for native video understanding when a Google API key is configured. Falls back to keyframe extraction via OpenCV if no Google API key is available.", mode="CLI", action_sets=["document_processing", "image"], requirement=["google-generativeai"], @@ -80,7 +80,7 @@ def understand_video(input_data: dict) -> dict: if not os.path.isfile(video_path): return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'File not found.'} - from app.config import get_api_key + from app.config import get_api_key, get_vlm_model api_key = get_api_key('gemini') if api_key: @@ -97,7 +97,8 @@ def understand_video(input_data: dict) -> dict: time.sleep(2) video_file = genai.get_file(video_file.name) - model = genai.GenerativeModel("gemini-1.5-pro") + vlm_model = get_vlm_model() or "gemini-1.5-pro" + model = genai.GenerativeModel(vlm_model) prompt = query if query else "Understand and describe the contents of this video." 
response = model.generate_content([video_file, prompt]) diff --git a/tests/test_gemini_client_multimodal.py b/tests/test_gemini_client_multimodal.py new file mode 100644 index 00000000..16c7b5fb --- /dev/null +++ b/tests/test_gemini_client_multimodal.py @@ -0,0 +1,49 @@ +import base64 +import pytest +from unittest.mock import patch, MagicMock +from agent_core.core.llm.google_gemini_client import GeminiClient + +FAKE_RESPONSE = { + "candidates": [{"content": {"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], + "usageMetadata": {"totalTokenCount": 10, "promptTokenCount": 8, "candidatesTokenCount": 2} +} + +@pytest.fixture +def client(): + return GeminiClient(api_key="fake-key") + +def test_single_image_produces_one_inlinedata_part(client): + """Passing image_bytes alone → exactly 1 inlineData in parts.""" + with patch.object(client, "_post_json", return_value=FAKE_RESPONSE) as mock_post: + client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes=b"img1") + # mock_post.call_args.args[1] is the payload + payload = mock_post.call_args.args[1] + parts = payload["contents"][0]["parts"] + inline_parts = [p for p in parts if "inlineData" in p] + assert len(inline_parts) == 1 + +def test_multi_image_produces_correct_count(client): + """Passing image_bytes_list of N images → exactly N inlineData parts.""" + with patch.object(client, "_post_json", return_value=FAKE_RESPONSE) as mock_post: + client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes_list=[b"a", b"b", b"c"]) + payload = mock_post.call_args.args[1] + parts = payload["contents"][0]["parts"] + inline_parts = [p for p in parts if "inlineData" in p] + assert len(inline_parts) == 3 + +def test_neither_image_raises_valueerror(client): + """Passing neither image_bytes nor image_bytes_list → ValueError.""" + with pytest.raises(ValueError): + client.generate_multimodal("gemini-2.0-flash", text="hi") + +def test_single_image_backwards_compat_response(client): + """Single-image call returns same 
response structure as before the refactor.""" + with patch.object(client, "_post_json", return_value=FAKE_RESPONSE): + result = client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes=b"img") + assert result["content"] == "ok" + assert result["tokens_used"] == 10 + +def test_generate_multimodal_multi_image_no_longer_exists(client): + """The old method must be gone.""" + assert not hasattr(client, "generate_multimodal_multi_image"), \ + "generate_multimodal_multi_image was not removed" diff --git a/tests/test_step1_vlm_interface.py b/tests/test_step1_vlm_interface.py index c1bf516f..88937c8c 100644 --- a/tests/test_step1_vlm_interface.py +++ b/tests/test_step1_vlm_interface.py @@ -42,14 +42,13 @@ def _make_client(self): client._timeout = 30 return client - def test_method_exists(self): - """generate_multimodal_multi_image must exist on GeminiClient.""" + def test_method_accepts_list(self): + """generate_multimodal must accept image_bytes_list.""" from agent_core.core.llm.google_gemini_client import GeminiClient - self.assertTrue( - hasattr(GeminiClient, "generate_multimodal_multi_image"), - "FAIL: GeminiClient.generate_multimodal_multi_image not found. 
" - "Add it to agent_core/core/llm/google_gemini_client.py" - ) + import inspect + sig = inspect.signature(GeminiClient.generate_multimodal) + self.assertIn("image_bytes_list", sig.parameters, + "FAIL: GeminiClient.generate_multimodal does not accept image_bytes_list.") def test_payload_contains_multiple_inline_data_parts(self): """The API payload must contain one inlineData entry per frame passed in.""" @@ -68,7 +67,7 @@ def fake_post(path, payload): client._post_json = fake_post frame_bytes = [b"frame1_bytes", b"frame2_bytes", b"frame3_bytes"] - result = client.generate_multimodal_multi_image( + result = client.generate_multimodal( "gemini-2.5-flash", text="What is happening?", image_bytes_list=frame_bytes, @@ -109,7 +108,7 @@ def test_system_prompt_is_included(self): captured = {} client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] - client.generate_multimodal_multi_image( + client.generate_multimodal( "gemini-2.5-flash", text="Describe", image_bytes_list=[b"img"], @@ -128,7 +127,7 @@ def test_no_system_prompt_omits_key(self): captured = {} client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] - client.generate_multimodal_multi_image( + client.generate_multimodal( "gemini-2.5-flash", text="Describe", image_bytes_list=[b"img"], @@ -527,6 +526,7 @@ def test_describe_image_bytes_returns_string(self): mock_response.choices = [mock_choice] mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 + mock_response.usage.prompt_tokens_details = None # Prevent MagicMock leak vlm.client = MagicMock() vlm.client.chat.completions.create.return_value = mock_response @@ -547,6 +547,7 @@ def test_describe_image_bytes_uses_json_response_format(self): mock_response.choices = [mock_choice] mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 + mock_response.usage.prompt_tokens_details = None # Prevent MagicMock leak vlm.client = MagicMock() 
vlm.client.chat.completions.create.return_value = mock_response diff --git a/tests/test_understand_video_model.py b/tests/test_understand_video_model.py new file mode 100644 index 00000000..cd69dc3b --- /dev/null +++ b/tests/test_understand_video_model.py @@ -0,0 +1,49 @@ +import pytest +from unittest.mock import patch, MagicMock +import os + +def test_understand_video_uses_configured_model(): + """understand_video must use get_vlm_model(), not hardcode gemini-1.5-pro.""" + mock_file = MagicMock() + mock_file.state.name = "ACTIVE" + mock_model_instance = MagicMock() + mock_model_instance.generate_content.return_value = MagicMock(text="video summary") + + with patch("os.path.isfile", return_value=True), \ + patch("app.config.get_api_key", return_value="fake-key"), \ + patch("app.config.get_vlm_model", return_value="gemini-2.0-flash") as mock_get_model, \ + patch("google.generativeai.configure"), \ + patch("google.generativeai.upload_file", return_value=mock_file), \ + patch("google.generativeai.get_file", return_value=mock_file), \ + patch("google.generativeai.GenerativeModel", return_value=mock_model_instance) as mock_gm, \ + patch("google.generativeai.delete_file"), \ + patch("builtins.open", MagicMock()), \ + patch("app.config.AGENT_WORKSPACE_ROOT", "/tmp"): + from app.data.action.understand_video import understand_video + understand_video({"video_path": "/fake/video.mp4"}) + called_model_name = mock_gm.call_args[0][0] + assert called_model_name == "gemini-2.0-flash", \ + f"Expected gemini-2.0-flash from config, got {called_model_name}" + +def test_understand_video_falls_back_when_config_missing(): + """If get_vlm_model() returns None, fall back to gemini-1.5-pro.""" + mock_file = MagicMock() + mock_file.state.name = "ACTIVE" + mock_model_instance = MagicMock() + mock_model_instance.generate_content.return_value = MagicMock(text="summary") + + with patch("os.path.isfile", return_value=True), \ + patch("app.config.get_api_key", return_value="fake-key"), \ + 
patch("app.config.get_vlm_model", return_value=None), \ + patch("google.generativeai.configure"), \ + patch("google.generativeai.upload_file", return_value=mock_file), \ + patch("google.generativeai.get_file", return_value=mock_file), \ + patch("google.generativeai.GenerativeModel", return_value=mock_model_instance) as mock_gm, \ + patch("google.generativeai.delete_file"), \ + patch("builtins.open", MagicMock()), \ + patch("app.config.AGENT_WORKSPACE_ROOT", "/tmp"): + from app.data.action.understand_video import understand_video + understand_video({"video_path": "/fake/video.mp4"}) + called_model_name = mock_gm.call_args[0][0] + assert called_model_name == "gemini-1.5-pro", \ + f"Expected fallback gemini-1.5-pro, got {called_model_name}" diff --git a/tests/test_vlm_interface_json_mode.py b/tests/test_vlm_interface_json_mode.py new file mode 100644 index 00000000..3d38495c --- /dev/null +++ b/tests/test_vlm_interface_json_mode.py @@ -0,0 +1,53 @@ +import pytest +from unittest.mock import MagicMock, patch +from agent_core.core.impl.vlm.interface import VLMInterface + +PLAIN_RESPONSE = {"content": "raw text output", "tokens_used": 5} + +def _make_vlm(provider="openai"): + """Create a VLMInterface with mocked internals.""" + with patch("agent_core.core.impl.vlm.interface.VLMInterface.__init__", return_value=None): + vlm = VLMInterface.__new__(VLMInterface) + vlm.provider = provider + vlm.model = "gpt-4o" + vlm.temperature = 0.5 + vlm._get_token_count = lambda: 0 + vlm._set_token_count = lambda x: None + vlm._report_usage = None + vlm._CODE_BLOCK_RE = VLMInterface._CODE_BLOCK_RE + return vlm + +def test_openai_json_mode_true_uses_json_method(): + """describe_image_bytes with json_mode=True (default) → _openai_describe_bytes.""" + vlm = _make_vlm("openai") + vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) + vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) + vlm.describe_image_bytes(b"img", json_mode=True) + 
vlm._openai_describe_bytes.assert_called_once() + vlm._openai_describe_bytes_plain.assert_not_called() + +def test_openai_json_mode_false_uses_plain_method(): + """describe_image_bytes with json_mode=False → _openai_describe_bytes_plain.""" + vlm = _make_vlm("openai") + vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) + vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) + vlm.describe_image_bytes(b"img", json_mode=False) + vlm._openai_describe_bytes_plain.assert_called_once() + vlm._openai_describe_bytes.assert_not_called() + +def test_default_json_mode_is_true(): + """Calling describe_image_bytes without json_mode defaults to True (no regression).""" + vlm = _make_vlm("openai") + vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) + vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) + vlm.describe_image_bytes(b"img") # no json_mode arg + vlm._openai_describe_bytes.assert_called_once() + +def test_gemini_unaffected_by_json_mode(): + """Gemini always uses _gemini_describe_bytes regardless of json_mode flag.""" + vlm = _make_vlm("gemini") + vlm._gemini_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) + vlm.describe_image_bytes(b"img", json_mode=False) + vlm._gemini_describe_bytes.assert_called_once() + vlm.describe_image_bytes(b"img", json_mode=True) + assert vlm._gemini_describe_bytes.call_count == 2 diff --git a/tests/test_vlm_ocr_wrapper.py b/tests/test_vlm_ocr_wrapper.py new file mode 100644 index 00000000..8e12846d --- /dev/null +++ b/tests/test_vlm_ocr_wrapper.py @@ -0,0 +1,52 @@ +import os +import pytest +import tempfile +from unittest.mock import MagicMock, patch +from agent_core.core.impl.vlm.interface import VLMInterface + +def _make_vlm(): + with patch("agent_core.core.impl.vlm.interface.VLMInterface.__init__", return_value=None): + vlm = VLMInterface.__new__(VLMInterface) + vlm.provider = "openai" + vlm.model = "gpt-4o" + vlm.temperature = 0.5 + vlm._get_token_count = 
lambda: 0 + vlm._set_token_count = lambda x: None + vlm._report_usage = None + vlm._CODE_BLOCK_RE = VLMInterface._CODE_BLOCK_RE + return vlm + +def test_ocr_calls_describe_image_bytes_with_json_mode_false(tmp_path): + """describe_image_ocr must delegate to describe_image_bytes with json_mode=False.""" + img_file = tmp_path / "test.png" + img_file.write_bytes(b"fakeimgdata") + vlm = _make_vlm() + vlm.describe_image_bytes = MagicMock(return_value="extracted text") + vlm.describe_image_ocr(str(img_file)) + call_kwargs = vlm.describe_image_bytes.call_args.kwargs + assert call_kwargs.get("json_mode") == False, \ + "describe_image_ocr must pass json_mode=False" + +def test_ocr_system_prompt_is_ocr_focused(tmp_path): + """The system prompt passed by OCR must mention OCR/extraction, not description.""" + img_file = tmp_path / "test.png" + img_file.write_bytes(b"fakeimgdata") + vlm = _make_vlm() + vlm.describe_image_bytes = MagicMock(return_value="text") + vlm.describe_image_ocr(str(img_file)) + sys_prompt = vlm.describe_image_bytes.call_args.kwargs.get("system_prompt", "") + assert "OCR" in sys_prompt or "extract" in sys_prompt.lower() + +def test_ocr_no_provider_routing_in_method(): + """describe_image_ocr source must not contain a provider routing switch.""" + import inspect + src = inspect.getsource(VLMInterface.describe_image_ocr) + assert "self.provider" not in src, \ + "describe_image_ocr still contains provider routing — refactor incomplete" + assert "elif self.provider ==" not in src, \ + "describe_image_ocr still contains provider routing switch" + +def test_ocr_raises_on_missing_file(): + vlm = _make_vlm() + with pytest.raises(FileNotFoundError): + vlm.describe_image_ocr("/nonexistent/path/image.png") From 3d08d8dbfba2b8f63d5802ade3400b71b840cf4c Mon Sep 17 00:00:00 2001 From: Korivi Date: Mon, 20 Apr 2026 12:11:52 +0900 Subject: [PATCH 08/81] Issues #207, #191 , and the LLM provider to local LLM switching direction now workcorrectly end-to-end. Issue fixed. 
Fixes: 1. STT Language switching screens issue fixed 2. Model fails issue fixes with auto retry, and also after auto retry, chat messages appear: the error text + a "What would you like to do?" message with Retry / Change Model buttons. 3. Added provider-switching directions work correctly end-to-end (Between provider LLLMs& Local LLMs ). Made full chain: Frontend save, backend validate, settings write, cache invalidate. --- app/agent_base.py | 31 +++++++++++++++++++ app/ui_layer/adapters/base.py | 21 +++++++++++++ .../frontend/src/pages/Chat/ChatPage.tsx | 17 ++++++++-- .../src/pages/Settings/ModelSettings.tsx | 20 ++++++++++-- app/ui_layer/controller/ui_controller.py | 2 ++ app/ui_layer/events/event_types.py | 1 + app/ui_layer/settings/model_settings.py | 4 +++ 7 files changed, 90 insertions(+), 6 deletions(-) diff --git a/app/agent_base.py b/app/agent_base.py index aa1b85de..94709a5a 100644 --- a/app/agent_base.py +++ b/app/agent_base.py @@ -154,6 +154,9 @@ def __init__( data_dir = data_dir, chroma_path=chroma_path ) + # Stores original task instructions keyed by session_id for LLM retry after failure + self._llm_retry_instructions: dict[str, str] = {} + # LLM + prompt plumbing (may be deferred if API key not yet configured) self.llm = LLMInterface( provider=llm_provider, @@ -1250,10 +1253,23 @@ async def _handle_react_error( f"[REACT ERROR] LLMConsecutiveFailureError detected - cancelling task {session_to_use} " "to prevent infinite retry loop." ) + # Cache instruction BEFORE cancellation removes task from tasks dict + failed_task = self.task_manager.tasks.get(session_to_use) if self.task_manager else None + if failed_task: + self._llm_retry_instructions[session_to_use] = failed_task.instruction if self.task_manager: await self.task_manager.mark_task_cancel( reason="LLM calls failed too many consecutive times. Task aborted." 
) + if self.ui_controller: + from app.ui_layer.events import UIEvent, UIEventType + self.ui_controller.event_bus.emit( + UIEvent( + type=UIEventType.LLM_FATAL_ERROR, + data={"session_id": session_to_use}, + task_id=session_to_use, + ) + ) else: await self._create_new_trigger(session_to_use, action_output, STATE) except Exception as e: @@ -1508,6 +1524,21 @@ async def handle_limit_abort(self, session_id: str) -> None: task_id=session_id, ) + async def handle_llm_retry(self, session_id: str) -> None: + """Retry the original task after a fatal LLM failure. Resets the failure counter and re-submits.""" + instruction = self._llm_retry_instructions.pop(session_id, None) + if not instruction: + logger.warning(f"[LLM_RETRY] Cannot retry: no cached instruction for session {session_id}") + return + + try: + self.llm.reset_failure_counter() + except Exception as e: + logger.debug(f"[LLM_RETRY] Could not reset failure counter: {e}") + + if self.ui_controller: + await self.ui_controller.submit_message(instruction) + # ----- Trigger Management ----- async def _cleanup_session_triggers(self, session_id: str) -> None: diff --git a/app/ui_layer/adapters/base.py b/app/ui_layer/adapters/base.py index 13dfdefc..117e7fba 100644 --- a/app/ui_layer/adapters/base.py +++ b/app/ui_layer/adapters/base.py @@ -205,6 +205,9 @@ def _subscribe_events(self) -> None: self._unsubscribers.append( bus.subscribe(UIEventType.ERROR_MESSAGE, self._handle_error_message) ) + self._unsubscribers.append( + bus.subscribe(UIEventType.LLM_FATAL_ERROR, self._handle_llm_fatal_error) + ) self._unsubscribers.append( bus.subscribe(UIEventType.INFO_MESSAGE, self._handle_info_message) ) @@ -307,6 +310,24 @@ def _handle_error_message(self, event: UIEvent) -> None: self._display_chat_message("Error", event.data.get("message", ""), "error") ) + def _handle_llm_fatal_error(self, event: UIEvent) -> None: + """Handle fatal LLM consecutive failure — show retry/change-model options.""" + from app.ui_layer.components.types 
import ChatMessageOption + session_id = event.data.get("session_id") + options = [ + ChatMessageOption(label="Retry", value="llm_retry", style="primary"), + ChatMessageOption(label="Change Model", value="llm_change_model", style="default"), + ] + asyncio.create_task( + self._display_chat_message( + "System", + "What would you like to do?", + "system", + task_session_id=session_id, + options=options, + ) + ) + def _handle_info_message(self, event: UIEvent) -> None: """Handle info message event.""" asyncio.create_task( diff --git a/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.tsx b/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.tsx index 756a1ad9..3bcb6169 100644 --- a/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.tsx +++ b/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.tsx @@ -1,7 +1,7 @@ import React, { useState, useRef, useEffect, useLayoutEffect, KeyboardEvent, useCallback, ChangeEvent, useMemo } from 'react' import { Send, Paperclip, X, Loader2, File, AlertCircle, Reply, Mic, MicOff } from 'lucide-react' import { useVirtualizer } from '@tanstack/react-virtual' -import { useLocation } from 'react-router-dom' +import { useLocation, useNavigate } from 'react-router-dom' import { useWebSocket } from '../../contexts/WebSocketContext' import { Button, IconButton, StatusIndicator } from '../../components/ui' import { useDerivedAgentStatus } from '../../hooks' @@ -54,6 +54,15 @@ const formatFileSize = (bytes: number): string => { export function ChatPage() { const { messages, actions, connected, sendMessage, cancelTask, cancellingTaskId, openFile, openFolder, lastSeenMessageId, markMessagesAsSeen, replyTarget, setReplyTarget, clearReplyTarget, loadOlderMessages, hasMoreMessages, loadingOlderMessages, sendOptionClick } = useWebSocket() + const navigate = useNavigate() + + const handleOptionClick = useCallback((value: string, sessionId?: string, messageId?: string) => { + if (value === 'llm_change_model') { + navigate('/settings') + return + 
} + sendOptionClick(value, sessionId, messageId) + }, [navigate, sendOptionClick]) // Derive agent status from actions and messages const status = useDerivedAgentStatus({ @@ -74,6 +83,8 @@ export function ChatPage() { const [isListening, setIsListening] = useState(false) const recognitionRef = useRef(null) const [micLang, setMicLang] = useState(() => { + const saved = localStorage.getItem('micLang') + if (saved && MIC_LANGUAGES.some(l => l.code === saved)) return saved const browserLang = navigator.language || 'en-US' return MIC_LANGUAGES.some(l => l.code === browserLang) ? browserLang : 'en-US' }) @@ -556,7 +567,7 @@ export function ChatPage() { onOpenFile={openFile} onOpenFolder={openFolder} onReply={handleChatReply} - onOptionClick={sendOptionClick} + onOptionClick={handleOptionClick} />
) @@ -614,7 +625,7 @@ export function ChatPage() {
) -}, (prev, next) => prev.message.messageId === next.message.messageId) +}, (prev, next) => + prev.message.messageId === next.message.messageId + && prev.message.optionSelected === next.message.optionSelected + && prev.message.content === next.message.content +) diff --git a/app/ui_layer/browser/frontend/src/pages/Settings/SettingsPage.module.css b/app/ui_layer/browser/frontend/src/pages/Settings/SettingsPage.module.css index 9f9adeae..0ff26105 100644 --- a/app/ui_layer/browser/frontend/src/pages/Settings/SettingsPage.module.css +++ b/app/ui_layer/browser/frontend/src/pages/Settings/SettingsPage.module.css @@ -1011,6 +1011,13 @@ flex-direction: column; } +.modalContent form { + display: flex; + flex-direction: column; + flex: 1; + min-height: 0; +} + .modalHeader { display: flex; align-items: center; From 91978c88d0191a6433c2c881ca9f2626cec8dec7 Mon Sep 17 00:00:00 2001 From: zfoong Date: Mon, 20 Apr 2026 13:51:13 +0900 Subject: [PATCH 10/81] bug:fix attachment item alignment issue in chat panel --- .../frontend/src/pages/Chat/ChatMessage.tsx | 18 +++++++++--------- .../src/pages/Chat/ChatPage.module.css | 6 +++++- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/app/ui_layer/browser/frontend/src/pages/Chat/ChatMessage.tsx b/app/ui_layer/browser/frontend/src/pages/Chat/ChatMessage.tsx index 631ae0a1..678a94c2 100644 --- a/app/ui_layer/browser/frontend/src/pages/Chat/ChatMessage.tsx +++ b/app/ui_layer/browser/frontend/src/pages/Chat/ChatMessage.tsx @@ -112,6 +112,15 @@ export const ChatMessageItem = memo(function ChatMessageItem({
)} + {message.attachments && message.attachments.length > 0 && ( +
+ +
+ )} {/* Reply button - positioned outside the bubble at top-right */} {canReply && isHovered && ( 0 && ( -
- -
- )} ) }, (prev, next) => diff --git a/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.module.css b/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.module.css index e09ed85c..ef210492 100644 --- a/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.module.css +++ b/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.module.css @@ -748,9 +748,13 @@ padding-right: var(--space-8); } -/* Message bubble container - wraps bubble + reply button */ +/* Message bubble container - wraps bubble + attachments + reply button */ .messageBubbleContainer { position: relative; + display: flex; + flex-direction: column; + gap: var(--space-2); + min-width: 0; } /* Reply button outside the bubble - positioned in the padding area */ From ef4e771a9e497e90d0f4ac1f4e8998f35da02701 Mon Sep 17 00:00:00 2001 From: Korivi Date: Mon, 20 Apr 2026 14:13:32 +0900 Subject: [PATCH 11/81] Md's no size limit Issued Fixed: Compress old events, keep recent ones verbatim, the agent always has fresh, recent context with no file rename and no prompt changed no cognitive load increase. Key point is: old events near the top of the file are already low-value history. The agent cares most about what just happened, and those lines are always preserved like a rolling memory. - No .old.md file is ever created so no prompt changes needed... - EVENT.md stays as EVENT.md so agent reads it exactly as before - When file hits 10 MB, we drop oldest 1/3, keep newest 2/3 in place so Agent always sees the most recent events so no cutoff ! 
--- agent_core/core/impl/event_stream/manager.py | 3 +++ agent_core/core/impl/task/manager.py | 2 ++ agent_core/utils/file_utils.py | 23 ++++++++++++++++++++ app/state/state_manager.py | 2 ++ 4 files changed, 30 insertions(+) create mode 100644 agent_core/utils/file_utils.py diff --git a/agent_core/core/impl/event_stream/manager.py b/agent_core/core/impl/event_stream/manager.py index 69e334ca..668f8af3 100644 --- a/agent_core/core/impl/event_stream/manager.py +++ b/agent_core/core/impl/event_stream/manager.py @@ -22,6 +22,7 @@ from agent_core.core.event_stream.event import Event from agent_core.core.protocols.llm import LLMInterfaceProtocol from agent_core.utils.logger import logger +from agent_core.utils.file_utils import rotate_md_file_if_needed from agent_core.core.state.base import get_state_or_none # Import memory mode check (deferred to avoid circular imports) @@ -298,6 +299,7 @@ def _log_to_files(self, kind: str, message: str) -> None: # Always write to EVENT.md (create if doesn't exist) try: event_file = self._agent_file_system_path / "EVENT.md" + rotate_md_file_if_needed(event_file) with open(event_file, "a", encoding="utf-8") as f: f.write(event_line) except Exception as e: @@ -309,6 +311,7 @@ def _log_to_files(self, kind: str, message: str) -> None: if not self._should_skip_unprocessed() and not self._should_skip_event_type(kind): try: unprocessed_file = self._agent_file_system_path / "EVENT_UNPROCESSED.md" + rotate_md_file_if_needed(unprocessed_file) with open(unprocessed_file, "a", encoding="utf-8") as f: f.write(event_line) except Exception as e: diff --git a/agent_core/core/impl/task/manager.py b/agent_core/core/impl/task/manager.py index 0e388374..f32ff234 100644 --- a/agent_core/core/impl/task/manager.py +++ b/agent_core/core/impl/task/manager.py @@ -41,6 +41,7 @@ # Set up logger - use shared agent_core logger for consistency from agent_core.utils.logger import logger +from agent_core.utils.file_utils import rotate_md_file_if_needed # 
============================================================================= @@ -732,6 +733,7 @@ def _log_to_task_history(self, task: Task, note: Optional[str] = None) -> None: entry_lines.append("") + rotate_md_file_if_needed(task_history_path) with open(task_history_path, "a", encoding="utf-8") as f: f.write("\n".join(entry_lines) + "\n") diff --git a/agent_core/utils/file_utils.py b/agent_core/utils/file_utils.py new file mode 100644 index 00000000..6cbbdca3 --- /dev/null +++ b/agent_core/utils/file_utils.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +"""File utility helpers for agent-core.""" + +from pathlib import Path + +# Maximum size (bytes) for append-only MD logs before trimming (default: 10 MB) +MAX_MD_FILE_BYTES = 10 * 1024 * 1024 + + +def rotate_md_file_if_needed(file_path: Path, max_bytes: int = MAX_MD_FILE_BYTES) -> None: + """Drop the oldest 1/3 of lines from *file_path* when it exceeds *max_bytes*. + + The file is trimmed in-place: the most recent 2/3 of lines are kept so the + agent never loses recent context and no extra archive files are created. 
+ """ + try: + if not file_path.exists() or file_path.stat().st_size < max_bytes: + return + lines = file_path.read_text(encoding="utf-8").splitlines(keepends=True) + keep_from = len(lines) // 3 # drop oldest 1/3, keep newest 2/3 + file_path.write_text("".join(lines[keep_from:]), encoding="utf-8") + except Exception: + pass # Never block a write due to trim failure diff --git a/app/state/state_manager.py b/app/state/state_manager.py index e122a1c0..30feda42 100644 --- a/app/state/state_manager.py +++ b/app/state/state_manager.py @@ -3,6 +3,7 @@ from pathlib import Path from agent_core.core.state.types import MainState from agent_core.core.state.session import StateSession +from agent_core.utils.file_utils import rotate_md_file_if_needed from app.state.types import AgentProperties from app.state.agent_state import STATE from app.event_stream import EventStreamManager @@ -197,6 +198,7 @@ def _append_to_conversation_history(self, sender: str, content: str) -> None: """ try: conversation_file = Path(AGENT_FILE_SYSTEM_PATH) / "CONVERSATION_HISTORY.md" + rotate_md_file_if_needed(conversation_file) timestamp = datetime.now().strftime("%Y/%m/%d %H:%M:%S") entry = f"[{timestamp}] [{sender}]: {content}\n" From 93d99deb21f00496947df86844ac73178637e021 Mon Sep 17 00:00:00 2001 From: zfoong Date: Mon, 20 Apr 2026 19:27:48 +0900 Subject: [PATCH 12/81] Readme: more languages support --- README.cn.md | 4 +- README.es.md | 486 ++++++++++++++++++++++++++++++++++++++++++++++++ README.ja.md | 4 +- README.ko.md | 486 ++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 4 +- README.zh-TW.md | 486 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1467 insertions(+), 3 deletions(-) create mode 100644 README.es.md create mode 100644 README.ko.md create mode 100644 README.zh-TW.md diff --git a/README.cn.md b/README.cn.md index cde213ea..481581b7 100644 --- a/README.cn.md +++ b/README.cn.md @@ -22,10 +22,12 @@
[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +
CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt

- English README | 日本語版はこちら + English | 日本語 | 繁體中文 | 한국어 | Español

## 🚀 概览 diff --git a/README.es.md b/README.es.md new file mode 100644 index 00000000..9a36feea --- /dev/null +++ b/README.es.md @@ -0,0 +1,486 @@ + +
+ CraftBot Banner +
+
+ +
+ Windows + macOS + Linux + + + GitHub Repo stars + + + License + + + Discord + +
+
+ +[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt +
+ +

+ English | 日本語 | 简体中文 | 繁體中文 | 한국어 +

+ +## 🚀 Descripción general +

+CraftBot es tu Asistente de IA Personal que vive dentro de tu máquina y trabaja 24/7 para ti. +

+ +Interpreta tareas de forma autónoma, planifica acciones y las ejecuta para alcanzar tus objetivos. +Aprende tus preferencias y metas, y te ayuda de manera proactiva a planificar e iniciar tareas para cumplir tus objetivos de vida. +Soporta MCP, Skills e integraciones con apps externas. + +CraftBot espera tus órdenes. Configura tu propio CraftBot ahora. + +
+ CraftBot Overview +
+ +--- + +## ✨ Características + +- **Bring Your Own Key (BYOK)** — Sistema flexible de proveedores de LLM con soporte para OpenAI, Google Gemini, Anthropic Claude, BytePlus y modelos locales de Ollama. Cambia entre proveedores fácilmente. +- **Sistema de Memoria** — Destila y consolida los eventos del día cada medianoche. +- **Agente Proactivo** — Aprende tus preferencias, hábitos y metas de vida. Luego planifica e inicia tareas (con tu aprobación, por supuesto) para ayudarte a mejorar en la vida. +- **Integración con herramientas externas** — Conéctate a Google Workspace, Slack, Notion, Zoom, LinkedIn, Discord y Telegram (¡vendrán más!) con credenciales integradas y soporte OAuth. +- **MCP** — Integración con Model Context Protocol para ampliar las capacidades del agente con herramientas y servicios externos. +- **Skills** — Framework de skills extensible con skills integradas para planificación de tareas, investigación, revisión de código, operaciones de git y más. +- **Multiplataforma** — Soporte completo para Windows, macOS y Linux con variantes de código específicas por plataforma y contenedorización con Docker. + +> [!IMPORTANT] +> **Nota sobre el modo GUI:** El modo GUI aún se encuentra en fase experimental. Esto significa que es posible que encuentres problemas cuando el agente cambie a modo GUI. Estamos mejorando esta función activamente. + +
+ CraftBot Banner + CraftBot Banner +
+ +--- + + +## 🧰 Primeros pasos + +### Requisitos previos +- Python **3.10+** +- `git` (necesario para clonar el repositorio) +- Una clave API del proveedor de LLM que elijas (OpenAI, Gemini o Anthropic) +- `Node.js` **18+** (opcional — solo necesario para la interfaz del navegador) +- `conda` (opcional — si no se encuentra, el instalador ofrece instalar Miniconda automáticamente) + +### Instalación rápida + +```bash +# Clona el repositorio +git clone https://github.com/CraftOS-dev/CraftBot.git +cd CraftBot + +# Instala las dependencias +python install.py + +# Ejecuta el agente +python run.py +``` + +¡Eso es todo! La primera ejecución te guiará en la configuración de tus claves API. + +**Nota:** Si no tienes Node.js instalado, el instalador te guiará con instrucciones paso a paso. También puedes omitir el modo navegador y usar TUI en su lugar (ver los modos a continuación). + +### ¿Qué puedes hacer justo después? +- Hablar con el agente de forma natural +- Pedirle que realice tareas complejas de varios pasos +- Escribir `/help` para ver los comandos disponibles +- Conectarte a Google, Slack, Notion y más + +### 🖥️ Modos de interfaz + +
+ CraftOS Banner +
+ +CraftBot soporta varios modos de UI. Elige según tu preferencia: + +| Modo | Comando | Requisitos | Recomendado para | +|------|---------|--------------|----------| +| **Browser** | `python run.py` | Node.js 18+ | Interfaz web moderna, la más sencilla de usar | +| **TUI** | `python run.py --tui` | Ninguno | UI en terminal, sin dependencias adicionales | +| **CLI** | `python run.py --cli` | Ninguno | Línea de comandos, ligero | +| **GUI** | `python run.py --gui` | `install.py --gui` | Automatización de escritorio con feedback visual | + +El **modo navegador** es el predeterminado y recomendado. Si no tienes Node.js, el instalador te ofrecerá instrucciones de instalación o puedes usar el **modo TUI** en su lugar. + +--- + +## 🧩 Visión general de la arquitectura + +| Componente | Descripción | +|-----------|-------------| +| **Agent Base** | Capa de orquestación central que gestiona el ciclo de vida de las tareas, coordina los componentes y maneja el bucle agente principal. | +| **LLM Interface** | Interfaz unificada que soporta múltiples proveedores LLM (OpenAI, Gemini, Anthropic, BytePlus, Ollama). | +| **Context Engine** | Genera prompts optimizados con soporte de KV-cache. | +| **Action Manager** | Recupera y ejecuta acciones desde la biblioteca. Las acciones personalizadas son fáciles de extender. | +| **Action Router** | Selecciona inteligentemente la acción que mejor se ajusta a los requisitos de la tarea y resuelve los parámetros de entrada mediante el LLM cuando es necesario. | +| **Event Stream** | Sistema de publicación de eventos en tiempo real para seguimiento del progreso de tareas, actualizaciones de UI y monitoreo de ejecución. | +| **Memory Manager** | Memoria semántica basada en RAG con ChromaDB. Gestiona fragmentación de memoria, embeddings, recuperación y actualizaciones incrementales. 
| +| **State Manager** | Gestión global del estado para rastrear el contexto de ejecución del agente, el historial de conversación y la configuración en tiempo de ejecución. | +| **Task Manager** | Administra definiciones de tareas, habilita modos de tareas simples y complejas, crea todos y hace seguimiento a flujos de trabajo multietapa. | +| **Skill Manager** | Carga e inyecta skills intercambiables en el contexto del agente. | +| **MCP Adapter** | Integración con Model Context Protocol que convierte herramientas MCP en acciones nativas. | +| **TUI Interface** | Interfaz de usuario de terminal construida con el framework Textual para operación interactiva por línea de comandos. | +| **GUI Module** | Automatización GUI experimental mediante contenedores Docker, OmniParser para detección de elementos de UI y cliente Gradio. | + +--- + +## 🔜 Hoja de ruta + +- [X] **Módulo de memoria** — Listo. +- [ ] **Integración con herramientas externas** — ¡Seguimos añadiendo más! +- [X] **Capa MCP** — Listo. +- [X] **Capa de Skills** — Listo. +- [X] **Comportamiento proactivo** — En curso + +--- + +## 🖥️ Modo GUI (opcional) + +El modo GUI permite la automatización de pantalla: el agente puede ver e interactuar con un entorno de escritorio. Esto es opcional y requiere configuración adicional. + +```bash +# Instalar con soporte GUI (usando pip, sin conda) +python install.py --gui + +# Instalar con soporte GUI y conda +python install.py --gui --conda + +# Ejecutar en modo GUI +python run.py --gui +``` + +> [!NOTE] +> El modo GUI es experimental y requiere dependencias adicionales (~4 GB para los pesos del modelo). Si no necesitas automatización de escritorio, omítelo y usa el modo Browser/TUI, que no tiene dependencias adicionales. 
+ +--- + +## 📋 Referencia de comandos + +### install.py + +| Flag | Descripción | +|------|-------------| +| `--gui` | Instala componentes GUI (OmniParser) | +| `--conda` | Usa entorno conda (opcional) | +| `--cpu-only` | Instala PyTorch solo para CPU (con `--gui`) | + +### run.py + +| Flag | Descripción | +|------|-------------| +| (ninguno) | Ejecutar en modo **Browser** (recomendado, requiere Node.js) | +| `--tui` | Ejecutar en modo **Terminal UI** (no requiere dependencias) | +| `--cli` | Ejecutar en modo **CLI** (ligero) | +| `--gui` | Habilitar modo de automatización GUI (requiere `install.py --gui` previamente) | + +### service.py + +| Comando | Descripción | +|---------|-------------| +| `install` | Instala dependencias, registra el autoarranque e inicia CraftBot | +| `start` | Inicia CraftBot en segundo plano | +| `stop` | Detiene CraftBot | +| `restart` | Detener y luego iniciar | +| `status` | Muestra el estado de ejecución y del autoarranque | +| `logs [-n N]` | Muestra las últimas N líneas de log (por defecto: 50) | +| `uninstall` | Elimina el registro de autoarranque | + +**Ejemplos de instalación:** +```bash +# Instalación simple con pip (sin conda) +python install.py + +# Con soporte GUI (usando pip, sin conda) +python install.py --gui + +# Con soporte GUI en sistemas solo CPU (usando pip, sin conda) +python install.py --gui --cpu-only + +# Con entorno conda (recomendado para usuarios de conda) +python install.py --conda + +# Con soporte GUI y conda +python install.py --gui --conda + +# Con GUI en sistemas solo CPU con conda +python install.py --gui --conda --cpu-only +``` + +**Ejecución de CraftBot:** + +```powershell +# Modo navegador (por defecto, requiere Node.js) +python run.py + +# Modo TUI (no requiere Node.js) +python run.py --tui + +# Modo CLI (ligero) +python run.py --cli + +# Con modo GPU/GUI +python run.py --gui + +# Con entorno conda +conda run -n craftbot python run.py + +# O usando la ruta completa si conda no está en PATH 
+&"$env:USERPROFILE\miniconda3\Scripts\conda.exe" run -n craftbot python run.py +``` + +**Linux/macOS (Bash):** +```bash +# Modo navegador (por defecto, requiere Node.js) +python run.py + +# Modo TUI (no requiere Node.js) +python run.py --tui + +# Modo CLI (ligero) +python run.py --cli + +# Con modo GPU/GUI +python run.py --gui + +# Con entorno conda +conda run -n craftbot python run.py +``` + +### 🔧 Servicio en segundo plano (recomendado) + +Ejecuta CraftBot como un servicio en segundo plano para que siga funcionando incluso después de cerrar la terminal. Se crea automáticamente un acceso directo en el escritorio para reabrir el navegador cuando quieras. + +```bash +# Instala dependencias, registra autoarranque al iniciar sesión e inicia CraftBot +python service.py install +``` + +Eso es todo. La terminal se cierra sola, CraftBot se ejecuta en segundo plano y el navegador se abre automáticamente. + +```bash +# Otros comandos del servicio: +python service.py start # Inicia CraftBot en segundo plano +python service.py status # Comprueba si está en ejecución +python service.py stop # Detiene CraftBot +python service.py restart # Reinicia CraftBot +python service.py logs # Ver el log reciente +``` + +| Comando | Descripción | +|---------|-------------| +| `python service.py install` | Instala dependencias, registra autoarranque al iniciar sesión, inicia CraftBot, abre el navegador y cierra la terminal automáticamente | +| `python service.py start` | Inicia CraftBot en segundo plano — se reinicia automáticamente si ya está en ejecución (la terminal se cierra sola) | +| `python service.py stop` | Detiene CraftBot | +| `python service.py restart` | Detiene e inicia CraftBot | +| `python service.py status` | Comprueba si CraftBot está en ejecución y si el autoarranque está habilitado | +| `python service.py logs` | Muestra la salida reciente del log (`-n 100` para más líneas) | +| `python service.py uninstall` | Detiene CraftBot, elimina el registro de autoarranque, 
desinstala paquetes pip y purga la caché de pip | + +> [!TIP] +> Tras `service.py start` o `service.py install`, se crea automáticamente un **acceso directo de CraftBot en el escritorio**. Si cierras el navegador por error, haz doble clic en el acceso directo para reabrirlo. + +> [!NOTE] +> **Instalación:** El instalador ahora ofrece orientación clara si faltan dependencias. Si no se encuentra Node.js, se te pedirá instalarlo o podrás cambiar al modo TUI. La instalación detecta automáticamente la disponibilidad de GPU y recurre al modo solo CPU si es necesario. + +> [!TIP] +> **Configuración inicial:** CraftBot te guiará por una secuencia de onboarding para configurar claves API, el nombre del agente, MCPs y Skills. + +> [!NOTE] +> **Playwright Chromium:** Opcional para la integración con WhatsApp Web. Si falla la instalación, el agente seguirá funcionando para otras tareas. Puedes instalarlo manualmente más tarde con: `playwright install chromium` + +--- + +## � Solución de problemas y preguntas frecuentes + +### Falta Node.js (para el modo navegador) +Si ves **"npm not found in PATH"** al ejecutar `python run.py`: +1. Descárgalo desde [nodejs.org](https://nodejs.org/) (elige la versión LTS) +2. Instálalo y reinicia tu terminal +3. Ejecuta `python run.py` de nuevo + +**Alternativa:** Usa el modo TUI (no necesita Node.js): +```bash +python run.py --tui +``` + +### La instalación falla por dependencias +Ahora el instalador ofrece mensajes de error detallados con soluciones. Si la instalación falla: +- **Revisa la versión de Python:** asegúrate de tener Python 3.10+ (`python --version`) +- **Revisa tu conexión a Internet:** las dependencias se descargan durante la instalación +- **Limpia la caché de pip:** ejecuta `pip install --upgrade pip` e inténtalo de nuevo + +### Problemas de instalación de Playwright +La instalación de Playwright Chromium es opcional. 
Si falla: +- El agente **seguirá funcionando** para otras tareas +- Puedes omitirla o instalarla más tarde: `playwright install chromium` +- Solo es necesaria para la integración con WhatsApp Web + +### Problemas de GPU/CUDA +El instalador detecta automáticamente la disponibilidad de GPU: +- Si la instalación de CUDA falla, se pasa al modo CPU automáticamente +- Para configuración CPU manual: `python install.py --gui --cpu-only` + +Para una solución de problemas más detallada, consulta [INSTALLATION_FIX.md](INSTALLATION_FIX.md). + +--- + +El agente puede conectarse a varios servicios usando OAuth. Las builds de release incluyen credenciales integradas, pero también puedes usar las tuyas. + +### Inicio rápido + +Para builds de release con credenciales integradas: +``` +/google login # Conectar Google Workspace +/zoom login # Conectar Zoom +/slack invite # Conectar Slack +/notion invite # Conectar Notion +/linkedin login # Conectar LinkedIn +``` + +### Detalles de los servicios + +| Servicio | Tipo de auth | Comando | ¿Requiere secreto? | +|---------|-----------|---------|------------------| +| Google | PKCE | `/google login` | No (PKCE) | +| Zoom | PKCE | `/zoom login` | No (PKCE) | +| Slack | OAuth 2.0 | `/slack invite` | Sí | +| Notion | OAuth 2.0 | `/notion invite` | Sí | +| LinkedIn | OAuth 2.0 | `/linkedin login` | Sí | + +### Uso de tus propias credenciales + +Si prefieres usar tus propias credenciales OAuth, añádelas a tu archivo `.env`: + +#### Google (PKCE — solo se necesita el Client ID) +```bash +GOOGLE_CLIENT_ID=your-client-id.apps.googleusercontent.com +``` +1. Ve a [Google Cloud Console](https://console.cloud.google.com/) +2. Habilita las APIs de Gmail, Calendar, Drive y People +3. Crea credenciales OAuth de tipo **Desktop app** +4. Copia el Client ID (el secreto no es necesario con PKCE) + +#### Zoom (PKCE — solo se necesita el Client ID) +```bash +ZOOM_CLIENT_ID=your-zoom-client-id +``` +1. Ve a [Zoom Marketplace](https://marketplace.zoom.us/) +2. 
Crea una app OAuth +3. Copia el Client ID + +#### Slack (requiere ambos) +```bash +SLACK_SHARED_CLIENT_ID=your-slack-client-id +SLACK_SHARED_CLIENT_SECRET=your-slack-client-secret +``` +1. Ve a [Slack API](https://api.slack.com/apps) +2. Crea una nueva app +3. Añade los scopes OAuth: `chat:write`, `channels:read`, `users:read`, etc. +4. Copia el Client ID y el Client Secret + +#### Notion (requiere ambos) +```bash +NOTION_SHARED_CLIENT_ID=your-notion-client-id +NOTION_SHARED_CLIENT_SECRET=your-notion-client-secret +``` +1. Ve a [Notion Developers](https://developers.notion.com/) +2. Crea una nueva integración (Public integration) +3. Copia el OAuth Client ID y el Secret + +#### LinkedIn (requiere ambos) +```bash +LINKEDIN_CLIENT_ID=your-linkedin-client-id +LINKEDIN_CLIENT_SECRET=your-linkedin-client-secret +``` +1. Ve a [LinkedIn Developers](https://developer.linkedin.com/) +2. Crea una app +3. Añade los scopes OAuth 2.0 +4. Copia el Client ID y el Client Secret + +--- +## Ejecutar con contenedor + +La raíz del repositorio incluye una configuración Docker con Python 3.10, paquetes clave del sistema (incluido Tesseract para OCR) y todas las dependencias de Python definidas en `environment.yml`/`requirements.txt`, de modo que el agente pueda ejecutarse de forma consistente en entornos aislados. + +A continuación las instrucciones para ejecutar nuestro agente con contenedor. + +### Construir la imagen + +Desde la raíz del repositorio: + +```bash +docker build -t craftbot . +``` + +### Ejecutar el contenedor + +La imagen está configurada para lanzar el agente con `python -m app.main` por defecto. 
Para ejecutarlo de forma interactiva: + +```bash +docker run --rm -it craftbot +``` + +Si necesitas suministrar variables de entorno, pasa un archivo env (por ejemplo, basado en `.env.example`): + +```bash +docker run --rm -it --env-file .env craftbot +``` + +Monta cualquier directorio que deba persistir fuera del contenedor (como carpetas de datos o caché) usando `-v`, y ajusta los puertos u otras opciones según lo necesite tu despliegue. La imagen trae dependencias del sistema para OCR (`tesseract`), automatización de pantalla (`pyautogui`, `mss`, utilidades X11 y un framebuffer virtual) y clientes HTTP comunes, de modo que el agente pueda trabajar con archivos, APIs de red y automatización GUI dentro del contenedor. + +### Habilitar automatización GUI/pantalla + +Las acciones GUI (eventos de ratón/teclado, capturas) requieren un servidor X11. Puedes conectarte al display del host o ejecutar de forma headless con `xvfb`: + +* Usar el display del host (requiere Linux con X11): + + ```bash + docker run --rm -it + -e DISPLAY=$DISPLAY \ + -v /tmp/.X11-unix:/tmp/.X11-unix \ + -v $(pwd)/data:/app/app/data \ + craftbot + ``` + + Añade más montajes `-v` para cualquier carpeta en la que el agente deba leer/escribir. + +* Ejecutar en modo headless con un display virtual: + + ```bash + docker run --rm -it --env-file .env craftbot bash -lc "Xvfb :99 -screen 0 1920x1080x24 & export DISPLAY=:99 && exec python -m app.main" + ``` + +Por defecto, la imagen usa Python 3.10 y empaqueta las dependencias de Python de `environment.yml`/`requirements.txt`, así que `python -m app.main` funciona de entrada. + +--- + +## 🤝 Cómo contribuir + +¡Las PRs son bienvenidas! Consulta [CONTRIBUTING.md](CONTRIBUTING.md) para el flujo de trabajo (fork → rama desde `dev` → PR). Todas las pull requests pasan automáticamente por CI de lint + smoke-test. 
Si tienes preguntas o quieres una conversación más rápida, únete a nuestro [Discord](https://discord.gg/ZN9YHc37HG) o escríbenos a thamyikfoong(at)craftos.net. + +## 🧾 Licencia + +Este proyecto está licenciado bajo la [Licencia MIT](LICENSE). Eres libre de usar, alojar y monetizar este proyecto (debes dar crédito a este proyecto en caso de distribución y monetización). + +--- + +## ⭐ Agradecimientos + +Desarrollado y mantenido por [CraftOS](https://craftos.net/) y los contribuyentes [@zfoong](https://github.com/zfoong) y [@ahmad-ajmal](https://github.com/ahmad-ajmal). +Si **CraftBot** te resulta útil, ¡pon una ⭐ al repositorio y compártelo con otras personas! diff --git a/README.ja.md b/README.ja.md index 8d77cbb4..9bb2a40e 100644 --- a/README.ja.md +++ b/README.ja.md @@ -22,10 +22,12 @@
[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt

- English version here | 中文版README + English | 简体中文 | 繁體中文 | 한국어 | Español

## 🚀 概要 diff --git a/README.ko.md b/README.ko.md new file mode 100644 index 00000000..105f7622 --- /dev/null +++ b/README.ko.md @@ -0,0 +1,486 @@ + +
+ CraftBot Banner +
+
+ +
+ Windows + macOS + Linux + + + GitHub Repo stars + + + License + + + Discord + +
+
+ +[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt +
+ +

+ English | 日本語 | 简体中文 | 繁體中文 | Español +

+ +## 🚀 개요 +

+CraftBot은 당신의 기기 안에 상주하며 24시간 내내 당신을 위해 일하는 개인 AI 어시스턴트입니다. +

+ +CraftBot은 작업을 자율적으로 해석하고, 행동을 계획하며, 당신의 목표를 달성하기 위해 이를 실행합니다. +사용자의 선호도와 목표를 학습하여, 삶의 목표를 이루도록 작업을 계획하고 능동적으로 시작하는 것을 도와줍니다. +MCP, 스킬, 그리고 외부 앱 통합을 지원합니다. + +CraftBot이 당신의 명령을 기다리고 있습니다. 지금 나만의 CraftBot을 설정해 보세요. + +
+ CraftBot Overview +
+ +--- + +## ✨ 주요 기능 + +- **Bring Your Own Key (BYOK)** — OpenAI, Google Gemini, Anthropic Claude, BytePlus, 로컬 Ollama 모델을 지원하는 유연한 LLM 제공자 시스템. 제공자 간 손쉬운 전환이 가능합니다. +- **메모리 시스템** — 하루 동안 발생한 사건들을 자정에 정제하고 통합합니다. +- **능동형 에이전트(Proactive Agent)** — 사용자의 선호도, 습관, 인생 목표를 학습합니다. 그리고 (물론 승인을 받은 뒤) 계획을 수행하고 작업을 시작하여 삶을 개선하도록 도와줍니다. +- **외부 도구 통합** — Google Workspace, Slack, Notion, Zoom, LinkedIn, Discord, Telegram과 연결됩니다(계속 추가 예정!). 내장된 자격 증명 및 OAuth가 지원됩니다. +- **MCP** — 외부 도구 및 서비스로 에이전트 기능을 확장하기 위한 Model Context Protocol 통합. +- **스킬(Skills)** — 작업 계획, 리서치, 코드 리뷰, Git 작업 등 내장 스킬을 갖춘 확장형 스킬 프레임워크. +- **크로스 플랫폼** — 플랫폼별 코드 변형 및 Docker 컨테이너화를 통해 Windows, macOS, Linux를 완벽하게 지원합니다. + +> [!IMPORTANT] +> **GUI 모드 안내:** GUI 모드는 아직 실험 단계입니다. 이로 인해 에이전트가 GUI 모드로 전환할 때 문제가 발생할 수 있습니다. 현재 활발히 개선 중입니다. + +
+ CraftBot Banner + CraftBot Banner +
+ +--- + + +## 🧰 시작하기 + +### 필수 요구 사항 +- Python **3.10+** +- `git` (리포지토리 클론 시 필요) +- 사용할 LLM 제공자의 API 키(OpenAI, Gemini 또는 Anthropic) +- `Node.js` **18+** (선택 사항 - 브라우저 인터페이스 사용 시에만 필요) +- `conda` (선택 사항 - 없을 경우 설치 프로그램이 Miniconda 자동 설치를 제안합니다) + +### 빠른 설치 + +```bash +# 리포지토리 클론 +git clone https://github.com/CraftOS-dev/CraftBot.git +cd CraftBot + +# 의존성 설치 +python install.py + +# 에이전트 실행 +python run.py +``` + +이게 전부입니다! 첫 실행 시 API 키 설정 과정을 안내해 줍니다. + +**참고:** Node.js가 설치되어 있지 않다면 설치 프로그램이 단계별로 안내해 드립니다. 브라우저 모드를 건너뛰고 TUI를 사용할 수도 있습니다(아래 모드 참고). + +### 바로 할 수 있는 일 +- 에이전트와 자연스럽게 대화 +- 복잡한 다단계 작업 요청 +- `/help`를 입력해 사용 가능한 명령 확인 +- Google, Slack, Notion 등과 연결 + +### 🖥️ 인터페이스 모드 + +
+ CraftOS Banner +
+ +CraftBot은 여러 UI 모드를 지원합니다. 선호에 따라 선택하세요. + +| 모드 | 명령어 | 요구 사항 | 적합한 용도 | +|------|---------|--------------|----------| +| **Browser** | `python run.py` | Node.js 18+ | 최신 웹 인터페이스, 가장 사용하기 쉬움 | +| **TUI** | `python run.py --tui` | 없음 | 터미널 UI, 별도 의존성 불필요 | +| **CLI** | `python run.py --cli` | 없음 | 커맨드라인, 경량 | +| **GUI** | `python run.py --gui` | `install.py --gui` | 시각적 피드백이 있는 데스크톱 자동화 | + +**브라우저 모드**가 기본이자 권장 모드입니다. Node.js가 없는 경우 설치 프로그램이 설치 안내를 제공하거나, 대신 **TUI 모드**를 사용할 수 있습니다. + +--- + +## 🧩 아키텍처 개요 + +| 구성 요소 | 설명 | +|-----------|-------------| +| **Agent Base** | 작업 라이프사이클을 관리하고 구성 요소 간 조정을 담당하며 주요 에이전틱 루프를 처리하는 핵심 오케스트레이션 계층. | +| **LLM Interface** | 여러 LLM 제공자(OpenAI, Gemini, Anthropic, BytePlus, Ollama)를 지원하는 통합 인터페이스. | +| **Context Engine** | KV 캐시를 지원하는 최적화된 프롬프트를 생성합니다. | +| **Action Manager** | 라이브러리에서 액션을 가져와 실행합니다. 커스텀 액션을 쉽게 확장할 수 있습니다. | +| **Action Router** | 작업 요구 사항에 가장 잘 맞는 액션을 지능적으로 선택하고, 필요 시 LLM을 통해 입력 매개변수를 해결합니다. | +| **Event Stream** | 작업 진행 추적, UI 업데이트, 실행 모니터링을 위한 실시간 이벤트 게시 시스템. | +| **Memory Manager** | ChromaDB 기반의 RAG 시맨틱 메모리. 메모리 청킹, 임베딩, 검색, 점진적 업데이트를 처리합니다. | +| **State Manager** | 에이전트 실행 컨텍스트, 대화 이력, 런타임 구성을 추적하는 전역 상태 관리. | +| **Task Manager** | 작업 정의를 관리하며 단순/복잡 작업 모드, 할 일 생성, 다단계 워크플로우 추적을 가능하게 합니다. | +| **Skill Manager** | 플러그형 스킬을 로드하여 에이전트 컨텍스트에 주입합니다. | +| **MCP Adapter** | MCP 도구를 네이티브 액션으로 변환하는 Model Context Protocol 통합. | +| **TUI Interface** | 대화형 커맨드라인 조작을 위해 Textual 프레임워크로 구축된 터미널 사용자 인터페이스. | +| **GUI Module** | Docker 컨테이너, UI 요소 감지를 위한 OmniParser, Gradio 클라이언트를 사용한 실험적 GUI 자동화. | + +--- + +## 🔜 로드맵 + +- [X] **메모리 모듈** — 완료. +- [ ] **외부 도구 통합** — 계속 추가 중! +- [X] **MCP 레이어** — 완료. +- [X] **스킬 레이어** — 완료. +- [X] **능동형 동작(Proactive Behaviour)** — 진행 중 + +--- + +## 🖥️ GUI 모드 (선택 사항) + +GUI 모드는 화면 자동화를 지원합니다 — 에이전트가 데스크톱 환경을 보고 상호작용할 수 있습니다. 선택 사항이며 추가 설정이 필요합니다. 
+ +```bash +# GUI 지원 설치 (pip 사용, conda 불필요) +python install.py --gui + +# GUI 지원과 conda를 함께 사용 +python install.py --gui --conda + +# GUI 모드로 실행 +python run.py --gui +``` + +> [!NOTE] +> GUI 모드는 실험적이며 추가 의존성(모델 가중치로 약 4GB)이 필요합니다. 데스크톱 자동화가 필요하지 않다면 이를 건너뛰고 추가 의존성이 없는 Browser/TUI 모드를 사용하세요. + +--- + +## 📋 명령어 레퍼런스 + +### install.py + +| 플래그 | 설명 | +|------|-------------| +| `--gui` | GUI 구성 요소(OmniParser) 설치 | +| `--conda` | conda 환경 사용 (선택 사항) | +| `--cpu-only` | CPU 전용 PyTorch 설치 (`--gui`와 함께 사용) | + +### run.py + +| 플래그 | 설명 | +|------|-------------| +| (없음) | **Browser** 모드로 실행 (권장, Node.js 필요) | +| `--tui` | **터미널 UI** 모드로 실행 (의존성 불필요) | +| `--cli` | **CLI** 모드로 실행 (경량) | +| `--gui` | GUI 자동화 모드 활성화 (`install.py --gui` 선행 필요) | + +### service.py + +| 명령 | 설명 | +|---------|-------------| +| `install` | 의존성 설치, 자동 시작 등록, CraftBot 실행 | +| `start` | CraftBot을 백그라운드에서 실행 | +| `stop` | CraftBot 중지 | +| `restart` | 중지 후 다시 시작 | +| `status` | 실행 상태 및 자동 시작 상태 표시 | +| `logs [-n N]` | 마지막 N개의 로그 라인 표시 (기본값: 50) | +| `uninstall` | 자동 시작 등록 해제 | + +**설치 예시:** +```bash +# 간단한 pip 설치 (conda 미사용) +python install.py + +# GUI 지원 설치 (pip, conda 미사용) +python install.py --gui + +# CPU 전용 시스템에서 GUI 지원 설치 (pip, conda 미사용) +python install.py --gui --cpu-only + +# conda 환경 사용 (conda 사용자에게 권장) +python install.py --conda + +# GUI 지원과 conda +python install.py --gui --conda + +# CPU 전용 시스템에서 GUI와 conda 함께 +python install.py --gui --conda --cpu-only +``` + +**CraftBot 실행:** + +```powershell +# Browser 모드 (기본, Node.js 필요) +python run.py + +# TUI 모드 (Node.js 불필요) +python run.py --tui + +# CLI 모드 (경량) +python run.py --cli + +# GPU/GUI 모드 +python run.py --gui + +# conda 환경에서 실행 +conda run -n craftbot python run.py + +# conda가 PATH에 없는 경우 전체 경로 사용 +&"$env:USERPROFILE\miniconda3\Scripts\conda.exe" run -n craftbot python run.py +``` + +**Linux/macOS (Bash):** +```bash +# Browser 모드 (기본, Node.js 필요) +python run.py + +# TUI 모드 (Node.js 불필요) +python run.py --tui + +# CLI 모드 (경량) +python run.py --cli 
+ +# GPU/GUI 모드 +python run.py --gui + +# conda 환경에서 실행 +conda run -n craftbot python run.py +``` + +### 🔧 백그라운드 서비스 (권장) + +터미널을 닫아도 CraftBot이 계속 실행되도록 백그라운드 서비스로 실행합니다. 데스크톱 바로가기가 자동으로 생성되므로 언제든지 브라우저를 다시 열 수 있습니다. + +```bash +# 의존성 설치, 로그인 시 자동 시작 등록, CraftBot 실행 +python service.py install +``` + +이게 전부입니다. 터미널은 자동으로 닫히고, CraftBot은 백그라운드에서 실행되며, 브라우저가 자동으로 열립니다. + +```bash +# 기타 서비스 명령: +python service.py start # CraftBot을 백그라운드에서 시작 +python service.py status # 실행 여부 확인 +python service.py stop # CraftBot 중지 +python service.py restart # CraftBot 재시작 +python service.py logs # 최근 로그 출력 확인 +``` + +| 명령 | 설명 | +|---------|-------------| +| `python service.py install` | 의존성 설치, 로그인 시 자동 시작 등록, CraftBot 실행, 브라우저 열기 후 터미널 자동 종료 | +| `python service.py start` | CraftBot을 백그라운드에서 시작 — 이미 실행 중이면 자동 재시작 (터미널 자동 종료) | +| `python service.py stop` | CraftBot 중지 | +| `python service.py restart` | CraftBot 중지 후 재시작 | +| `python service.py status` | CraftBot 실행 여부와 자동 시작 활성화 여부 확인 | +| `python service.py logs` | 최근 로그 출력 표시 (`-n 100`으로 더 많은 줄 표시) | +| `python service.py uninstall` | CraftBot 중지, 자동 시작 등록 해제, pip 패키지 제거 및 pip 캐시 정리 | + +> [!TIP] +> `service.py start` 또는 `service.py install` 실행 후 **CraftBot 데스크톱 바로가기**가 자동으로 생성됩니다. 브라우저를 실수로 닫았다면 바로가기를 더블클릭해 다시 열 수 있습니다. + +> [!NOTE] +> **설치:** 의존성이 누락된 경우 설치 프로그램이 명확한 안내를 제공합니다. Node.js가 없으면 설치 여부를 묻거나 TUI 모드로 전환할 수 있습니다. GPU 가용성을 자동으로 감지하고 필요한 경우 CPU 전용 모드로 대체합니다. + +> [!TIP] +> **첫 실행 설정:** CraftBot은 API 키, 에이전트 이름, MCP, 스킬 설정을 위한 온보딩 과정을 안내합니다. + +> [!NOTE] +> **Playwright Chromium:** WhatsApp Web 통합에 필요한 선택 사항입니다. 설치에 실패해도 다른 작업에서는 에이전트가 정상 작동합니다. 나중에 `playwright install chromium`으로 수동 설치할 수 있습니다. + +--- + +## � 문제 해결 및 자주 발생하는 이슈 + +### Node.js 누락 (브라우저 모드용) +`python run.py` 실행 시 **"npm not found in PATH"** 오류가 보인다면: +1. [nodejs.org](https://nodejs.org/)에서 다운로드 (LTS 버전 권장) +2. 설치 후 터미널 재시작 +3. 
`python run.py`를 다시 실행 + +**대안:** TUI 모드를 사용하세요 (Node.js 불필요): +```bash +python run.py --tui +``` + +### 의존성 설치 실패 +설치 프로그램은 이제 해결 방법이 포함된 자세한 오류 메시지를 제공합니다. 설치가 실패한다면: +- **Python 버전 확인:** Python 3.10+인지 확인 (`python --version`) +- **인터넷 연결 확인:** 설치 중 의존성이 다운로드됩니다 +- **pip 캐시 초기화:** `pip install --upgrade pip` 후 다시 시도 + +### Playwright 설치 문제 +Playwright chromium 설치는 선택 사항입니다. 실패 시: +- 에이전트는 다른 작업에서 **정상 작동**합니다 +- 건너뛰거나 나중에 설치 가능: `playwright install chromium` +- WhatsApp Web 통합에만 필요합니다 + +### GPU/CUDA 문제 +설치 프로그램은 GPU 가용성을 자동으로 감지합니다: +- CUDA 설치가 실패하면 자동으로 CPU 모드로 대체됩니다 +- CPU 수동 설정: `python install.py --gui --cpu-only` + +자세한 문제 해결은 [INSTALLATION_FIX.md](INSTALLATION_FIX.md)를 참고하세요. + +--- + +에이전트는 OAuth를 사용해 다양한 서비스에 연결할 수 있습니다. 릴리스 빌드에는 자격 증명이 내장되어 있지만, 자신의 자격 증명을 사용할 수도 있습니다. + +### 빠른 시작 + +자격 증명이 내장된 릴리스 빌드의 경우: +``` +/google login # Google Workspace 연결 +/zoom login # Zoom 연결 +/slack invite # Slack 연결 +/notion invite # Notion 연결 +/linkedin login # LinkedIn 연결 +``` + +### 서비스 세부 정보 + +| 서비스 | 인증 유형 | 명령 | 시크릿 필요? | +|---------|-----------|---------|------------------| +| Google | PKCE | `/google login` | 불필요 (PKCE) | +| Zoom | PKCE | `/zoom login` | 불필요 (PKCE) | +| Slack | OAuth 2.0 | `/slack invite` | 필요 | +| Notion | OAuth 2.0 | `/notion invite` | 필요 | +| LinkedIn | OAuth 2.0 | `/linkedin login` | 필요 | + +### 자신의 자격 증명 사용하기 + +자체 OAuth 자격 증명을 사용하려면 `.env` 파일에 추가하세요. + +#### Google (PKCE - Client ID만 필요) +```bash +GOOGLE_CLIENT_ID=your-client-id.apps.googleusercontent.com +``` +1. [Google Cloud Console](https://console.cloud.google.com/) 접속 +2. Gmail, Calendar, Drive, People API 활성화 +3. **데스크톱 앱** 유형으로 OAuth 자격 증명 생성 +4. Client ID 복사 (PKCE에서는 시크릿 불필요) + +#### Zoom (PKCE - Client ID만 필요) +```bash +ZOOM_CLIENT_ID=your-zoom-client-id +``` +1. [Zoom Marketplace](https://marketplace.zoom.us/) 접속 +2. OAuth 앱 생성 +3. Client ID 복사 + +#### Slack (둘 다 필요) +```bash +SLACK_SHARED_CLIENT_ID=your-slack-client-id +SLACK_SHARED_CLIENT_SECRET=your-slack-client-secret +``` +1. 
[Slack API](https://api.slack.com/apps) 접속 +2. 새 앱 생성 +3. OAuth 스코프 추가: `chat:write`, `channels:read`, `users:read` 등 +4. Client ID와 Client Secret 복사 + +#### Notion (둘 다 필요) +```bash +NOTION_SHARED_CLIENT_ID=your-notion-client-id +NOTION_SHARED_CLIENT_SECRET=your-notion-client-secret +``` +1. [Notion Developers](https://developers.notion.com/) 접속 +2. 새 통합 생성 (Public integration) +3. OAuth Client ID와 Secret 복사 + +#### LinkedIn (둘 다 필요) +```bash +LINKEDIN_CLIENT_ID=your-linkedin-client-id +LINKEDIN_CLIENT_SECRET=your-linkedin-client-secret +``` +1. [LinkedIn Developers](https://developer.linkedin.com/) 접속 +2. 앱 생성 +3. OAuth 2.0 스코프 추가 +4. Client ID와 Client Secret 복사 + +--- +## 컨테이너로 실행하기 + +리포지토리 루트에는 Python 3.10, OCR을 위한 Tesseract를 포함한 주요 시스템 패키지, 그리고 `environment.yml`/`requirements.txt`에 정의된 모든 Python 의존성을 갖춘 Docker 구성이 포함되어 있습니다. 이를 통해 격리된 환경에서 에이전트를 일관되게 실행할 수 있습니다. + +아래는 컨테이너로 에이전트를 실행하는 설정 방법입니다. + +### 이미지 빌드 + +리포지토리 루트에서: + +```bash +docker build -t craftbot . +``` + +### 컨테이너 실행 + +이 이미지는 기본적으로 `python -m app.main`으로 에이전트를 실행하도록 구성되어 있습니다. 대화형으로 실행하려면: + +```bash +docker run --rm -it craftbot +``` + +환경 변수를 제공하려면 env 파일을 전달하세요 (예: `.env.example` 기반): + +```bash +docker run --rm -it --env-file .env craftbot +``` + +컨테이너 외부에 유지해야 하는 디렉터리(데이터, 캐시 폴더 등)는 `-v`를 사용해 마운트하고, 배포 환경에 맞게 포트나 추가 플래그를 조정하세요. 이미지에는 OCR(`tesseract`), 화면 자동화(`pyautogui`, `mss`, X11 유틸리티, 가상 프레임버퍼), 일반 HTTP 클라이언트 등의 시스템 의존성이 포함되어 있어 컨테이너 내에서 파일, 네트워크 API, GUI 자동화를 모두 처리할 수 있습니다. + +### GUI/화면 자동화 활성화 + +GUI 작업(마우스/키보드 이벤트, 스크린샷)은 X11 서버가 필요합니다. 호스트 디스플레이에 연결하거나 `xvfb`를 사용해 헤드리스로 실행할 수 있습니다. + +* 호스트 디스플레이 사용 (X11이 있는 Linux 필요): + + ```bash + docker run --rm -it + -e DISPLAY=$DISPLAY \ + -v /tmp/.X11-unix:/tmp/.X11-unix \ + -v $(pwd)/data:/app/app/data \ + craftbot + ``` + + 에이전트가 읽거나 쓸 폴더에 대해 추가 `-v` 마운트를 더해주세요. 
+ +* 가상 디스플레이로 헤드리스 실행: + + ```bash + docker run --rm -it --env-file .env craftbot bash -lc "Xvfb :99 -screen 0 1920x1080x24 & export DISPLAY=:99 && exec python -m app.main" + ``` + +기본적으로 이미지는 Python 3.10을 사용하고 `environment.yml`/`requirements.txt`의 Python 의존성을 번들로 포함하므로 `python -m app.main`이 바로 동작합니다. + +--- + +## 🤝 기여 방법 + +PR을 환영합니다! 워크플로우(포크 → `dev`에서 브랜치 생성 → PR)는 [CONTRIBUTING.md](CONTRIBUTING.md)를 참고하세요. 모든 풀 리퀘스트는 린트 + 스모크 테스트 CI를 자동으로 거칩니다. 질문이 있거나 더 빠른 대화를 원하시면 [Discord](https://discord.gg/ZN9YHc37HG)에 참여하거나 thamyikfoong(at)craftos.net로 이메일을 보내주세요. + +## 🧾 라이선스 + +이 프로젝트는 [MIT 라이선스](LICENSE)로 배포됩니다. 이 프로젝트를 자유롭게 사용, 호스팅, 수익화할 수 있습니다(배포 및 수익화 시 이 프로젝트를 크레딧으로 명시해야 합니다). + +--- + +## ⭐ 감사의 말 + +[CraftOS](https://craftos.net/)와 기여자 [@zfoong](https://github.com/zfoong), [@ahmad-ajmal](https://github.com/ahmad-ajmal)이 개발 및 유지 관리하고 있습니다. +**CraftBot**이 유용하다고 느끼신다면 리포지토리에 ⭐를 눌러주시고 다른 분들에게도 공유해 주세요! diff --git a/README.md b/README.md index 666b19f9..69087a55 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,12 @@
[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt

- 日本語版はこちら | 中文版README + 日本語 | 简体中文 | 繁體中文 | 한국어 | Español

## 🚀 Overview diff --git a/README.zh-TW.md b/README.zh-TW.md new file mode 100644 index 00000000..31e87799 --- /dev/null +++ b/README.zh-TW.md @@ -0,0 +1,486 @@ + +
+ CraftBot Banner +
+
+ +
+ Windows + macOS + Linux + + + GitHub Repo stars + + + License + + + Discord + +
+
+ +[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt +
+ +

+ English | 日本語 | 简体中文 | 한국어 | Español +

+ +## 🚀 概覽 +

+CraftBot 是你的個人 AI 助理,它駐留在你的裝置中,全天候為你服務。 +

+ +它會自主解讀任務、規劃行動並執行它們,協助你達成目標。 +它會學習你的偏好與目標,主動協助你規劃並展開任務,幫助你實現人生目標。 +支援 MCP、技能(Skills)以及外部應用整合。 + +CraftBot 正在等待你的指令,立刻建立屬於你自己的 CraftBot 吧。 + +
+ CraftBot Overview +
+ +--- + +## ✨ 功能特色 + +- **自帶金鑰(BYOK)** — 靈活的 LLM 供應商系統,支援 OpenAI、Google Gemini、Anthropic Claude、BytePlus 及本地 Ollama 模型,可輕鬆切換。 +- **記憶系統** — 每天午夜時分提煉並整合當日所發生的事件。 +- **主動式代理人** — 學習你的偏好、習慣與人生目標,接著進行規劃並(在取得同意後)主動啟動任務,協助你在生活中不斷進步。 +- **外部工具整合** — 連接 Google Workspace、Slack、Notion、Zoom、LinkedIn、Discord 及 Telegram(更多服務陸續推出!),內建憑證與 OAuth 支援。 +- **MCP** — 整合 Model Context Protocol,以外部工具與服務擴充代理人的能力。 +- **技能(Skills)** — 可擴充的技能框架,內建任務規劃、研究、程式碼審查、Git 操作等多種技能。 +- **跨平台** — 完整支援 Windows、macOS 與 Linux,並提供對應的平台程式碼與 Docker 容器化。 + +> [!IMPORTANT] +> **關於 GUI 模式:** GUI 模式目前仍處於實驗階段。當代理人切換到 GUI 模式時可能會遇到問題。我們正在持續改進此功能。 + +
+ CraftBot Banner + CraftBot Banner +
+ +--- + + +## 🧰 快速開始 + +### 先決條件 +- Python **3.10+** +- `git`(複製儲存庫時需要) +- 所選 LLM 供應商的 API 金鑰(OpenAI、Gemini 或 Anthropic) +- `Node.js` **18+**(選用——僅於使用瀏覽器介面時需要) +- `conda`(選用——若未安裝,安裝程式可代為安裝 Miniconda) + +### 快速安裝 + +```bash +# 複製儲存庫 +git clone https://github.com/CraftOS-dev/CraftBot.git +cd CraftBot + +# 安裝相依套件 +python install.py + +# 執行代理人 +python run.py +``` + +這樣就完成了!首次執行時會引導你設定 API 金鑰。 + +**注意:** 若尚未安裝 Node.js,安裝程式會提供逐步指引。你也可以跳過瀏覽器模式,改用 TUI(請見下方模式說明)。 + +### 立即能做什麼? +- 用自然語言與代理人對話 +- 請它執行複雜的多步驟任務 +- 輸入 `/help` 查看可用指令 +- 連接 Google、Slack、Notion 等服務 + +### 🖥️ 介面模式 + +
+ CraftOS Banner +
+ +CraftBot 支援多種 UI 模式,可依個人偏好選擇: + +| 模式 | 指令 | 需求 | 適用情境 | +|------|---------|--------------|----------| +| **Browser** | `python run.py` | Node.js 18+ | 現代化網頁介面,最易使用 | +| **TUI** | `python run.py --tui` | 無 | 終端機 UI,無須額外相依套件 | +| **CLI** | `python run.py --cli` | 無 | 命令列,輕量化 | +| **GUI** | `python run.py --gui` | `install.py --gui` | 帶視覺回饋的桌面自動化 | + +**Browser 模式**為預設與建議選項。若沒有 Node.js,安裝程式會提供安裝指引,或你可改用 **TUI 模式**。 + +--- + +## 🧩 架構概覽 + +| 元件 | 說明 | +|-----------|-------------| +| **Agent Base** | 負責管理任務生命週期、協調各元件並處理主要代理人迴圈的核心編排層。 | +| **LLM Interface** | 支援多家 LLM 供應商(OpenAI、Gemini、Anthropic、BytePlus、Ollama)的統一介面。 | +| **Context Engine** | 產生最佳化的 Prompt,支援 KV-Cache。 | +| **Action Manager** | 從動作庫中擷取並執行動作,方便擴充自訂動作。 | +| **Action Router** | 依任務需求智慧挑選最合適的動作,並在需要時透過 LLM 解析輸入參數。 | +| **Event Stream** | 即時事件發佈系統,用於任務進度追蹤、UI 更新與執行監控。 | +| **Memory Manager** | 以 ChromaDB 為基礎的 RAG 語意記憶,處理記憶分塊、嵌入、檢索與增量更新。 | +| **State Manager** | 全域狀態管理,追蹤代理人執行脈絡、對話歷史與執行期設定。 | +| **Task Manager** | 管理任務定義,支援簡單與複雜任務模式、待辦清單建立,以及多步驟流程追蹤。 | +| **Skill Manager** | 載入並將可插拔技能注入到代理人情境中。 | +| **MCP Adapter** | Model Context Protocol 整合,將 MCP 工具轉換為原生動作。 | +| **TUI Interface** | 以 Textual 框架打造的終端機使用者介面,提供互動式命令列操作。 | +| **GUI Module** | 實驗性的 GUI 自動化,採用 Docker 容器、OmniParser(用於 UI 元素偵測)與 Gradio 用戶端。 | + +--- + +## 🔜 藍圖 + +- [X] **記憶模組** — 完成。 +- [ ] **外部工具整合** — 仍在持續新增! 
+- [X] **MCP 層** — 完成。 +- [X] **技能層** — 完成。 +- [X] **主動式行為** — 進行中 + +--- + +## 🖥️ GUI 模式(選用) + +GUI 模式可啟用螢幕自動化——代理人能看見桌面並與其互動。此為選用功能,需要額外安裝。 + +```bash +# 安裝 GUI 支援(使用 pip,不需 conda) +python install.py --gui + +# 安裝 GUI 支援並搭配 conda +python install.py --gui --conda + +# 以 GUI 模式執行 +python run.py --gui +``` + +> [!NOTE] +> GUI 模式屬於實驗性,需要額外相依套件(模型權重約 4GB)。若不需桌面自動化,請改用沒有額外相依套件的 Browser/TUI 模式。 + +--- + +## 📋 指令參考 + +### install.py + +| 旗標 | 說明 | +|------|-------------| +| `--gui` | 安裝 GUI 元件(OmniParser) | +| `--conda` | 使用 conda 環境(選用) | +| `--cpu-only` | 僅安裝 CPU 版 PyTorch(需搭配 `--gui`) | + +### run.py + +| 旗標 | 說明 | +|------|-------------| +| (無) | 以 **Browser** 模式執行(建議,需 Node.js) | +| `--tui` | 以 **Terminal UI** 模式執行(無需額外相依) | +| `--cli` | 以 **CLI** 模式執行(輕量) | +| `--gui` | 啟用 GUI 自動化模式(需先執行 `install.py --gui`) | + +### service.py + +| 指令 | 說明 | +|---------|-------------| +| `install` | 安裝相依套件、註冊開機自動啟動,並啟動 CraftBot | +| `start` | 在背景啟動 CraftBot | +| `stop` | 停止 CraftBot | +| `restart` | 停止後重新啟動 | +| `status` | 顯示執行狀態與自動啟動狀態 | +| `logs [-n N]` | 顯示最後 N 行記錄(預設 50) | +| `uninstall` | 移除自動啟動註冊 | + +**安裝範例:** +```bash +# 單純使用 pip 安裝(不使用 conda) +python install.py + +# 安裝 GUI 支援(使用 pip,不使用 conda) +python install.py --gui + +# 於僅 CPU 的系統安裝 GUI 支援(使用 pip,不使用 conda) +python install.py --gui --cpu-only + +# 使用 conda 環境(建議給 conda 使用者) +python install.py --conda + +# 同時啟用 GUI 與 conda +python install.py --gui --conda + +# 於僅 CPU 的系統使用 GUI 及 conda +python install.py --gui --conda --cpu-only +``` + +**執行 CraftBot:** + +```powershell +# Browser 模式(預設,需 Node.js) +python run.py + +# TUI 模式(無需 Node.js) +python run.py --tui + +# CLI 模式(輕量) +python run.py --cli + +# GPU/GUI 模式 +python run.py --gui + +# 使用 conda 環境 +conda run -n craftbot python run.py + +# 若 conda 不在 PATH,使用完整路徑 +&"$env:USERPROFILE\miniconda3\Scripts\conda.exe" run -n craftbot python run.py +``` + +**Linux/macOS(Bash):** +```bash +# Browser 模式(預設,需 Node.js) +python run.py + +# TUI 模式(無需 Node.js) +python run.py --tui + +# CLI 
模式(輕量) +python run.py --cli + +# GPU/GUI 模式 +python run.py --gui + +# 使用 conda 環境 +conda run -n craftbot python run.py +``` + +### 🔧 背景服務(建議) + +將 CraftBot 當成背景服務執行,即使關閉終端機仍能持續運作。系統會自動建立桌面捷徑,讓你隨時可重新開啟瀏覽器。 + +```bash +# 安裝相依套件、註冊登入時自動啟動並啟動 CraftBot +python service.py install +``` + +這樣就完成了。終端機會自動關閉,CraftBot 在背景執行,瀏覽器也會自動開啟。 + +```bash +# 其他服務指令: +python service.py start # 在背景啟動 CraftBot +python service.py status # 檢查是否正在執行 +python service.py stop # 停止 CraftBot +python service.py restart # 重新啟動 CraftBot +python service.py logs # 檢視最近的記錄 +``` + +| 指令 | 說明 | +|---------|-------------| +| `python service.py install` | 安裝相依套件、註冊登入時自動啟動、啟動 CraftBot、開啟瀏覽器並自動關閉終端機 | +| `python service.py start` | 在背景啟動 CraftBot——若已在執行,會自動重啟(終端機自動關閉) | +| `python service.py stop` | 停止 CraftBot | +| `python service.py restart` | 停止並重新啟動 CraftBot | +| `python service.py status` | 檢查 CraftBot 是否執行中,以及自動啟動是否啟用 | +| `python service.py logs` | 顯示最近的記錄(使用 `-n 100` 顯示更多行) | +| `python service.py uninstall` | 停止 CraftBot、移除自動啟動註冊、解除 pip 套件並清除 pip 快取 | + +> [!TIP] +> 執行 `service.py start` 或 `service.py install` 後,會自動建立 **CraftBot 桌面捷徑**。若不小心關閉了瀏覽器,雙擊捷徑即可重新開啟。 + +> [!NOTE] +> **安裝:** 若相依套件缺失,安裝程式會提供清楚的指引。若找不到 Node.js,會提示你安裝或切換至 TUI 模式。安裝程式會自動偵測 GPU 是否可用,必要時會自動回退至 CPU 模式。 + +> [!TIP] +> **首次設定:** CraftBot 會引導你完成初始化流程,包含設定 API 金鑰、代理人名稱、MCP 與技能。 + +> [!NOTE] +> **Playwright Chromium:** 整合 WhatsApp Web 時選用。若安裝失敗,代理人仍可正常執行其他任務。稍後可以手動安裝:`playwright install chromium`。 + +--- + +## � 疑難排解與常見問題 + +### 缺少 Node.js(Browser 模式) +若執行 `python run.py` 時看到 **"npm not found in PATH"**: +1. 從 [nodejs.org](https://nodejs.org/) 下載(建議 LTS 版本) +2. 安裝完成後重新啟動終端機 +3. 
再次執行 `python run.py` + +**替代方案:** 改用 TUI 模式(不需 Node.js): +```bash +python run.py --tui +``` + +### 相依套件安裝失敗 +安裝程式現在會提供詳細錯誤訊息及解決方案。若安裝失敗: +- **確認 Python 版本:** 確保安裝 Python 3.10+(`python --version`) +- **檢查網路連線:** 安裝過程需下載相依套件 +- **清除 pip 快取:** 執行 `pip install --upgrade pip` 後再試 + +### Playwright 安裝問題 +Playwright chromium 為選用安裝,若失敗: +- 代理人的其他功能**仍可正常運作** +- 可先跳過,日後再安裝:`playwright install chromium` +- 僅於整合 WhatsApp Web 時需要 + +### GPU/CUDA 問題 +安裝程式會自動偵測 GPU: +- CUDA 安裝失敗時會自動切換至 CPU 模式 +- 手動 CPU 安裝:`python install.py --gui --cpu-only` + +更多疑難排解請參閱 [INSTALLATION_FIX.md](INSTALLATION_FIX.md)。 + +--- + +代理人可透過 OAuth 連接多種服務。Release 版本內建憑證,但你也可以使用自己的憑證。 + +### 快速上手 + +若使用內建憑證的 Release 版本: +``` +/google login # 連接 Google Workspace +/zoom login # 連接 Zoom +/slack invite # 連接 Slack +/notion invite # 連接 Notion +/linkedin login # 連接 LinkedIn +``` + +### 服務細節 + +| 服務 | 驗證方式 | 指令 | 是否需要密鑰? | +|---------|-----------|---------|------------------| +| Google | PKCE | `/google login` | 否(PKCE) | +| Zoom | PKCE | `/zoom login` | 否(PKCE) | +| Slack | OAuth 2.0 | `/slack invite` | 是 | +| Notion | OAuth 2.0 | `/notion invite` | 是 | +| LinkedIn | OAuth 2.0 | `/linkedin login` | 是 | + +### 使用自己的憑證 + +若希望使用自己的 OAuth 憑證,請將其加入 `.env` 檔: + +#### Google(PKCE,只需 Client ID) +```bash +GOOGLE_CLIENT_ID=your-client-id.apps.googleusercontent.com +``` +1. 前往 [Google Cloud Console](https://console.cloud.google.com/) +2. 啟用 Gmail、Calendar、Drive 與 People API +3. 建立 OAuth 憑證,類型選 **Desktop app** +4. 複製 Client ID(PKCE 不需 secret) + +#### Zoom(PKCE,只需 Client ID) +```bash +ZOOM_CLIENT_ID=your-zoom-client-id +``` +1. 前往 [Zoom Marketplace](https://marketplace.zoom.us/) +2. 建立 OAuth 應用程式 +3. 複製 Client ID + +#### Slack(兩者皆需) +```bash +SLACK_SHARED_CLIENT_ID=your-slack-client-id +SLACK_SHARED_CLIENT_SECRET=your-slack-client-secret +``` +1. 前往 [Slack API](https://api.slack.com/apps) +2. 建立新應用程式 +3. 新增 OAuth 範圍:`chat:write`、`channels:read`、`users:read` 等 +4. 
複製 Client ID 與 Client Secret + +#### Notion(兩者皆需) +```bash +NOTION_SHARED_CLIENT_ID=your-notion-client-id +NOTION_SHARED_CLIENT_SECRET=your-notion-client-secret +``` +1. 前往 [Notion Developers](https://developers.notion.com/) +2. 建立新的整合(Public integration) +3. 複製 OAuth Client ID 與 Secret + +#### LinkedIn(兩者皆需) +```bash +LINKEDIN_CLIENT_ID=your-linkedin-client-id +LINKEDIN_CLIENT_SECRET=your-linkedin-client-secret +``` +1. 前往 [LinkedIn Developers](https://developer.linkedin.com/) +2. 建立應用程式 +3. 新增 OAuth 2.0 範圍 +4. 複製 Client ID 與 Client Secret + +--- +## 使用容器執行 + +儲存庫根目錄提供 Docker 設定,內含 Python 3.10、OCR 用的 Tesseract 等關鍵系統套件,以及 `environment.yml`/`requirements.txt` 中定義的所有 Python 相依套件,讓代理人可在隔離環境中穩定執行。 + +以下是透過容器執行代理人的設定說明。 + +### 建置映像檔 + +於儲存庫根目錄執行: + +```bash +docker build -t craftbot . +``` + +### 執行容器 + +映像檔預設會以 `python -m app.main` 啟動代理人。若要以互動方式執行: + +```bash +docker run --rm -it craftbot +``` + +若需傳入環境變數,可透過 env 檔(例如以 `.env.example` 為基礎): + +```bash +docker run --rm -it --env-file .env craftbot +``` + +可使用 `-v` 掛載需要保存在容器外的目錄(如資料或快取資料夾),並依部署需求調整連接埠或其他旗標。映像檔內建 OCR(`tesseract`)、螢幕自動化(`pyautogui`、`mss`、X11 工具與虛擬 framebuffer)以及常見 HTTP 用戶端等系統相依,能讓代理人在容器中處理檔案、網路 API 與 GUI 自動化。 + +### 啟用 GUI/螢幕自動化 + +GUI 動作(滑鼠/鍵盤事件、截圖)需要 X11 伺服器。你可以連接到主機顯示或使用 `xvfb` 以無頭方式執行: + +* 使用主機顯示(需 Linux 搭配 X11): + + ```bash + docker run --rm -it + -e DISPLAY=$DISPLAY \ + -v /tmp/.X11-unix:/tmp/.X11-unix \ + -v $(pwd)/data:/app/app/data \ + craftbot + ``` + + 針對代理人需要讀寫的資料夾,可再新增 `-v` 掛載。 + +* 以虛擬顯示無頭執行: + + ```bash + docker run --rm -it --env-file .env craftbot bash -lc "Xvfb :99 -screen 0 1920x1080x24 & export DISPLAY=:99 && exec python -m app.main" + ``` + +映像檔預設使用 Python 3.10,並內建 `environment.yml`/`requirements.txt` 中的 Python 相依套件,因此 `python -m app.main` 可直接運作。 + +--- + +## 🤝 如何貢獻 + +歡迎提交 PR!詳細流程(fork → 由 `dev` 建分支 → 提 PR)請見 [CONTRIBUTING.md](CONTRIBUTING.md)。所有 Pull Request 都會自動執行 lint 與 smoke-test CI。如果你有任何疑問,或想更快速地溝通,歡迎加入 [Discord](https://discord.gg/ZN9YHc37HG) 或寄信至 thamyikfoong(at)craftos.net。 + 
+## 🧾 授權條款 + +本專案採用 [MIT 授權條款](LICENSE)。你可以自由使用、部署並商業化本專案(如需散佈或商業化,請註明出處)。 + +--- + +## ⭐ 致謝 + +本專案由 [CraftOS](https://craftos.net/) 與貢獻者 [@zfoong](https://github.com/zfoong)、[@ahmad-ajmal](https://github.com/ahmad-ajmal) 共同開發與維護。 +如果你覺得 **CraftBot** 好用,歡迎為儲存庫按下 ⭐ 並分享給更多人! From 506bca1bc21e69b5a2ae1171e4992bf01543d7f1 Mon Sep 17 00:00:00 2001 From: zfoong Date: Tue, 21 Apr 2026 01:30:42 +0900 Subject: [PATCH 13/81] README:add more languages support --- README.cn.md | 2 +- README.de.md | 486 ++++++++++++++++++++++++++++++++++++++++++++++++ README.es.md | 2 +- README.fr.md | 486 ++++++++++++++++++++++++++++++++++++++++++++++++ README.ja.md | 2 +- README.ko.md | 2 +- README.md | 2 +- README.pt-BR.md | 486 ++++++++++++++++++++++++++++++++++++++++++++++++ README.zh-TW.md | 2 +- 9 files changed, 1464 insertions(+), 6 deletions(-) create mode 100644 README.de.md create mode 100644 README.fr.md create mode 100644 README.pt-BR.md diff --git a/README.cn.md b/README.cn.md index 481581b7..e4a9525d 100644 --- a/README.cn.md +++ b/README.cn.md @@ -27,7 +27,7 @@

- English | 日本語 | 繁體中文 | 한국어 | Español + English | 日本語 | 繁體中文 | 한국어 | Español | Português | Français | Deutsch

## 🚀 概览 diff --git a/README.de.md b/README.de.md new file mode 100644 index 00000000..e1a67555 --- /dev/null +++ b/README.de.md @@ -0,0 +1,486 @@ + +
+ CraftBot Banner +
+
+ +
+ Windows + macOS + Linux + + + GitHub Repo stars + + + License + + + Discord + +
+
+ +[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt +
+ +

+ English | 日本語 | 简体中文 | 繁體中文 | 한국어 | Español | Português | Français +

+ +## 🚀 Überblick +

+CraftBot ist dein persönlicher KI-Assistent, der auf deinem Rechner lebt und rund um die Uhr für dich arbeitet. +

+ +Er interpretiert Aufgaben autonom, plant Aktionen und führt sie aus, um deine Ziele zu erreichen. +Er lernt deine Vorlieben und Ziele kennen und hilft dir proaktiv dabei, Aufgaben zu planen und anzustoßen, damit du deine Lebensziele erreichst. +MCPs, Skills und Integrationen mit externen Apps werden unterstützt. + +CraftBot wartet auf deine Befehle. Richte jetzt deinen eigenen CraftBot ein. + +
+ CraftBot Overview +
+ +--- + +## ✨ Funktionen + +- **Bring Your Own Key (BYOK)** — Flexibles LLM-Provider-System mit Unterstützung für OpenAI, Google Gemini, Anthropic Claude, BytePlus und lokale Ollama-Modelle. Wechsle Anbieter mühelos. +- **Speichersystem** — Destilliert und konsolidiert um Mitternacht die Ereignisse des Tages. +- **Proaktiver Agent** — Lernt deine Vorlieben, Gewohnheiten und Lebensziele kennen. Anschließend plant er und startet (selbstverständlich nach Freigabe) Aufgaben, die dir beim Fortschritt helfen. +- **Externe Tool-Integration** — Verbinde dich mit Google Workspace, Slack, Notion, Zoom, LinkedIn, Discord und Telegram (weitere folgen!) mit eingebetteten Zugangsdaten und OAuth-Unterstützung. +- **MCP** — Integration des Model Context Protocol, um die Fähigkeiten des Agents um externe Tools und Dienste zu erweitern. +- **Skills** — Erweiterbares Skill-Framework mit eingebauten Skills für Aufgabenplanung, Recherche, Code-Reviews, Git-Operationen und mehr. +- **Plattformübergreifend** — Vollständige Unterstützung für Windows, macOS und Linux mit plattformspezifischen Code-Varianten und Docker-Containerisierung. + +> [!IMPORTANT] +> **Hinweis zum GUI-Modus:** Der GUI-Modus befindet sich noch in einer experimentellen Phase. Beim Wechsel in den GUI-Modus kann es also zu Problemen kommen. Wir verbessern dieses Feature aktiv. + +
+ CraftBot Banner + CraftBot Banner +
+ +--- + + +## 🧰 Erste Schritte + +### Voraussetzungen +- Python **3.10+** +- `git` (erforderlich zum Klonen des Repositorys) +- Ein API-Schlüssel für den gewählten LLM-Anbieter (OpenAI, Gemini oder Anthropic) +- `Node.js` **18+** (optional – nur für die Browser-Oberfläche erforderlich) +- `conda` (optional – wenn nicht vorhanden, bietet das Installationsprogramm an, Miniconda automatisch zu installieren) + +### Schnellinstallation + +```bash +# Repository klonen +git clone https://github.com/CraftOS-dev/CraftBot.git +cd CraftBot + +# Abhängigkeiten installieren +python install.py + +# Agent starten +python run.py +``` + +Das war's! Beim ersten Start wirst du durch die Einrichtung deiner API-Schlüssel geführt. + +**Hinweis:** Wenn Node.js nicht installiert ist, führt dich das Installationsprogramm Schritt für Schritt durch die Installation. Du kannst den Browser-Modus auch überspringen und stattdessen die TUI verwenden (siehe Modi unten). + +### Was kannst du direkt danach tun? +- Natürlich mit dem Agent sprechen +- Ihn komplexe, mehrstufige Aufgaben ausführen lassen +- `/help` eingeben, um verfügbare Befehle zu sehen +- Dich mit Google, Slack, Notion und mehr verbinden + +### 🖥️ Schnittstellenmodi + +
+ CraftOS Banner +
+ +CraftBot unterstützt mehrere UI-Modi. Wähle nach deinen Vorlieben: + +| Modus | Befehl | Voraussetzungen | Empfohlen für | +|------|---------|--------------|----------| +| **Browser** | `python run.py` | Node.js 18+ | Moderne Web-Oberfläche, am einfachsten | +| **TUI** | `python run.py --tui` | Keine | Terminal-UI, ohne Abhängigkeiten | +| **CLI** | `python run.py --cli` | Keine | Kommandozeile, leichtgewichtig | +| **GUI** | `python run.py --gui` | `install.py --gui` | Desktop-Automatisierung mit visuellem Feedback | + +Der **Browser-Modus** ist Standard und wird empfohlen. Ohne Node.js gibt dir das Installationsprogramm eine Anleitung – alternativ kannst du den **TUI-Modus** nutzen. + +--- + +## 🧩 Architekturüberblick + +| Komponente | Beschreibung | +|-----------|-------------| +| **Agent Base** | Zentrale Orchestrierungsschicht, die den Task-Lifecycle verwaltet, zwischen Komponenten koordiniert und die Haupt-Agenten-Schleife steuert. | +| **LLM Interface** | Einheitliche Schnittstelle mit Unterstützung mehrerer LLM-Anbieter (OpenAI, Gemini, Anthropic, BytePlus, Ollama). | +| **Context Engine** | Erzeugt optimierte Prompts mit KV-Cache-Unterstützung. | +| **Action Manager** | Ruft Aktionen aus der Bibliothek ab und führt sie aus. Eigene Aktionen lassen sich leicht erweitern. | +| **Action Router** | Wählt intelligent die am besten passende Aktion auf Basis der Task-Anforderungen und löst Eingabeparameter bei Bedarf über das LLM auf. | +| **Event Stream** | Echtzeit-Event-Publishing-System für Fortschrittsverfolgung, UI-Updates und Ausführungs-Monitoring. | +| **Memory Manager** | RAG-basiertes semantisches Gedächtnis mit ChromaDB. Übernimmt Memory-Chunking, Embedding, Retrieval und inkrementelle Updates. | +| **State Manager** | Globales State-Management zur Verfolgung von Ausführungskontext, Gesprächshistorie und Laufzeitkonfiguration. 
| +| **Task Manager** | Verwaltet Task-Definitionen, ermöglicht einfache und komplexe Task-Modi, erstellt To-dos und verfolgt mehrstufige Workflows. | +| **Skill Manager** | Lädt einsteckbare Skills und injiziert sie in den Agent-Kontext. | +| **MCP Adapter** | Model Context Protocol Integration, die MCP-Tools in native Aktionen umwandelt. | +| **TUI Interface** | Textual-basierte Terminal-Benutzeroberfläche für interaktive Kommandozeilennutzung. | +| **GUI Module** | Experimentelle GUI-Automatisierung mit Docker-Containern, OmniParser zur UI-Elementerkennung und Gradio-Client. | + +--- + +## 🔜 Roadmap + +- [X] **Memory-Modul** — Fertig. +- [ ] **Externe Tool-Integration** — Wir fügen noch weitere hinzu! +- [X] **MCP-Schicht** — Fertig. +- [X] **Skill-Schicht** — Fertig. +- [X] **Proaktives Verhalten** — In Arbeit + +--- + +## 🖥️ GUI-Modus (optional) + +Der GUI-Modus ermöglicht Bildschirmautomatisierung – der Agent kann eine Desktop-Umgebung sehen und mit ihr interagieren. Das ist optional und erfordert zusätzliche Einrichtung. + +```bash +# Mit GUI-Unterstützung installieren (via pip, ohne conda) +python install.py --gui + +# Mit GUI-Unterstützung und conda installieren +python install.py --gui --conda + +# Im GUI-Modus starten +python run.py --gui +``` + +> [!NOTE] +> Der GUI-Modus ist experimentell und benötigt zusätzliche Abhängigkeiten (~4 GB für Modellgewichte). Wenn du keine Desktop-Automatisierung brauchst, überspringe das und verwende den Browser-/TUI-Modus, der keine zusätzlichen Abhängigkeiten hat. 
+ +--- + +## 📋 Befehlsreferenz + +### install.py + +| Flag | Beschreibung | +|------|-------------| +| `--gui` | GUI-Komponenten installieren (OmniParser) | +| `--conda` | conda-Umgebung nutzen (optional) | +| `--cpu-only` | CPU-only PyTorch installieren (mit `--gui`) | + +### run.py + +| Flag | Beschreibung | +|------|-------------| +| (keines) | Im **Browser**-Modus ausführen (empfohlen, Node.js erforderlich) | +| `--tui` | Im **Terminal-UI**-Modus ausführen (keine Abhängigkeiten nötig) | +| `--cli` | Im **CLI**-Modus ausführen (leichtgewichtig) | +| `--gui` | GUI-Automatisierungsmodus aktivieren (setzt vorheriges `install.py --gui` voraus) | + +### service.py + +| Befehl | Beschreibung | +|---------|-------------| +| `install` | Abhängigkeiten installieren, Autostart registrieren und CraftBot starten | +| `start` | CraftBot im Hintergrund starten | +| `stop` | CraftBot stoppen | +| `restart` | Stoppen und neu starten | +| `status` | Laufstatus und Autostart-Status anzeigen | +| `logs [-n N]` | Die letzten N Log-Zeilen anzeigen (Standard: 50) | +| `uninstall` | Autostart-Registrierung entfernen | + +**Installationsbeispiele:** +```bash +# Einfache pip-Installation (ohne conda) +python install.py + +# Mit GUI-Unterstützung (via pip, ohne conda) +python install.py --gui + +# Mit GUI auf CPU-only-Systemen (via pip, ohne conda) +python install.py --gui --cpu-only + +# Mit conda-Umgebung (empfohlen für conda-Nutzer) +python install.py --conda + +# Mit GUI-Unterstützung und conda +python install.py --gui --conda + +# Mit GUI auf CPU-only-Systemen mit conda +python install.py --gui --conda --cpu-only +``` + +**CraftBot ausführen:** + +```powershell +# Browser-Modus (Standard, Node.js erforderlich) +python run.py + +# TUI-Modus (kein Node.js nötig) +python run.py --tui + +# CLI-Modus (leichtgewichtig) +python run.py --cli + +# Mit GPU/GUI-Modus +python run.py --gui + +# Mit conda-Umgebung +conda run -n craftbot python run.py + +# Oder mit vollständigem Pfad, falls conda 
nicht im PATH ist +&"$env:USERPROFILE\miniconda3\Scripts\conda.exe" run -n craftbot python run.py +``` + +**Linux/macOS (Bash):** +```bash +# Browser-Modus (Standard, Node.js erforderlich) +python run.py + +# TUI-Modus (kein Node.js nötig) +python run.py --tui + +# CLI-Modus (leichtgewichtig) +python run.py --cli + +# Mit GPU/GUI-Modus +python run.py --gui + +# Mit conda-Umgebung +conda run -n craftbot python run.py +``` + +### 🔧 Hintergrunddienst (empfohlen) + +Betreibe CraftBot als Hintergrunddienst, sodass er auch nach dem Schließen des Terminals weiterläuft. Eine Desktop-Verknüpfung wird automatisch erstellt, damit du den Browser jederzeit wieder öffnen kannst. + +```bash +# Abhängigkeiten installieren, Autostart bei Anmeldung registrieren und CraftBot starten +python service.py install +``` + +Das war's. Das Terminal schließt sich von selbst, CraftBot läuft im Hintergrund und der Browser öffnet sich automatisch. + +```bash +# Weitere Dienstbefehle: +python service.py start # CraftBot im Hintergrund starten +python service.py status # Prüfen, ob er läuft +python service.py stop # CraftBot stoppen +python service.py restart # CraftBot neu starten +python service.py logs # Aktuelle Log-Ausgabe ansehen +``` + +| Befehl | Beschreibung | +|---------|-------------| +| `python service.py install` | Abhängigkeiten installieren, Autostart bei Anmeldung registrieren, CraftBot starten, Browser öffnen und Terminal automatisch schließen | +| `python service.py start` | CraftBot im Hintergrund starten – startet automatisch neu, wenn er bereits läuft (Terminal schließt sich selbst) | +| `python service.py stop` | CraftBot stoppen | +| `python service.py restart` | CraftBot stoppen und starten | +| `python service.py status` | Prüfen, ob CraftBot läuft und ob Autostart aktiviert ist | +| `python service.py logs` | Aktuelle Log-Ausgabe anzeigen (`-n 100` für mehr Zeilen) | +| `python service.py uninstall` | CraftBot stoppen, Autostart entfernen, pip-Pakete deinstallieren und 
pip-Cache leeren | + +> [!TIP] +> Nach `service.py start` oder `service.py install` wird automatisch eine **CraftBot-Desktop-Verknüpfung** erstellt. Hast du den Browser versehentlich geschlossen, doppelklicke die Verknüpfung, um ihn wieder zu öffnen. + +> [!NOTE] +> **Installation:** Das Installationsprogramm gibt nun klare Hinweise, falls Abhängigkeiten fehlen. Wird Node.js nicht gefunden, wirst du zur Installation aufgefordert oder kannst in den TUI-Modus wechseln. Die Installation erkennt die GPU-Verfügbarkeit automatisch und fällt bei Bedarf auf den CPU-Modus zurück. + +> [!TIP] +> **Ersteinrichtung:** CraftBot führt dich durch einen Onboarding-Ablauf, um API-Schlüssel, den Agentennamen, MCPs und Skills zu konfigurieren. + +> [!NOTE] +> **Playwright Chromium:** Optional für die WhatsApp-Web-Integration. Schlägt die Installation fehl, funktioniert der Agent weiterhin für andere Aufgaben. Manuell nachinstallieren mit: `playwright install chromium` + +--- + +## � Fehlerbehebung und häufige Probleme + +### Fehlendes Node.js (für den Browser-Modus) +Erscheint **"npm not found in PATH"** beim Ausführen von `python run.py`: +1. Von [nodejs.org](https://nodejs.org/) herunterladen (LTS-Version wählen) +2. Installieren und das Terminal neu starten +3. `python run.py` erneut ausführen + +**Alternative:** TUI-Modus verwenden (kein Node.js nötig): +```bash +python run.py --tui +``` + +### Installation schlägt bei Abhängigkeiten fehl +Das Installationsprogramm liefert jetzt detaillierte Fehlermeldungen mit Lösungen. Wenn die Installation fehlschlägt: +- **Python-Version prüfen:** Stelle sicher, dass du Python 3.10+ hast (`python --version`) +- **Internet prüfen:** Abhängigkeiten werden während der Installation heruntergeladen +- **pip-Cache leeren:** `pip install --upgrade pip` ausführen und erneut versuchen + +### Probleme bei der Playwright-Installation +Die Playwright-Chromium-Installation ist optional. 
Bei einem Fehlschlag: +- Der Agent **funktioniert weiterhin** für andere Aufgaben +- Du kannst ihn überspringen oder später installieren: `playwright install chromium` +- Nur für die WhatsApp-Web-Integration erforderlich + +### GPU-/CUDA-Probleme +Das Installationsprogramm erkennt die GPU-Verfügbarkeit automatisch: +- Schlägt die CUDA-Installation fehl, wird automatisch in den CPU-Modus gewechselt +- Für manuelle CPU-Einrichtung: `python install.py --gui --cpu-only` + +Ausführliche Hinweise zur Fehlerbehebung findest du in [INSTALLATION_FIX.md](INSTALLATION_FIX.md). + +--- + +Der Agent kann sich über OAuth mit verschiedenen Diensten verbinden. Release-Builds enthalten eingebettete Zugangsdaten, du kannst aber auch deine eigenen verwenden. + +### Schnellstart + +Für Release-Builds mit eingebetteten Zugangsdaten: +``` +/google login # Google Workspace verbinden +/zoom login # Zoom verbinden +/slack invite # Slack verbinden +/notion invite # Notion verbinden +/linkedin login # LinkedIn verbinden +``` + +### Dienst-Details + +| Dienst | Auth-Typ | Befehl | Secret nötig? | +|---------|-----------|---------|------------------| +| Google | PKCE | `/google login` | Nein (PKCE) | +| Zoom | PKCE | `/zoom login` | Nein (PKCE) | +| Slack | OAuth 2.0 | `/slack invite` | Ja | +| Notion | OAuth 2.0 | `/notion invite` | Ja | +| LinkedIn | OAuth 2.0 | `/linkedin login` | Ja | + +### Eigene Zugangsdaten verwenden + +Möchtest du deine eigenen OAuth-Zugangsdaten verwenden, trage sie in deine `.env`-Datei ein: + +#### Google (PKCE – nur Client ID nötig) +```bash +GOOGLE_CLIENT_ID=your-client-id.apps.googleusercontent.com +``` +1. Gehe zur [Google Cloud Console](https://console.cloud.google.com/) +2. Aktiviere die APIs für Gmail, Calendar, Drive und People +3. Erstelle OAuth-Zugangsdaten vom Typ **Desktop app** +4. Kopiere die Client ID (für PKCE ist kein Secret nötig) + +#### Zoom (PKCE – nur Client ID nötig) +```bash +ZOOM_CLIENT_ID=your-zoom-client-id +``` +1. 
Gehe zum [Zoom Marketplace](https://marketplace.zoom.us/) +2. Erstelle eine OAuth-App +3. Kopiere die Client ID + +#### Slack (beides erforderlich) +```bash +SLACK_SHARED_CLIENT_ID=your-slack-client-id +SLACK_SHARED_CLIENT_SECRET=your-slack-client-secret +``` +1. Gehe zur [Slack API](https://api.slack.com/apps) +2. Erstelle eine neue App +3. Füge OAuth-Scopes hinzu: `chat:write`, `channels:read`, `users:read` usw. +4. Kopiere Client ID und Client Secret + +#### Notion (beides erforderlich) +```bash +NOTION_SHARED_CLIENT_ID=your-notion-client-id +NOTION_SHARED_CLIENT_SECRET=your-notion-client-secret +``` +1. Gehe zu [Notion Developers](https://developers.notion.com/) +2. Erstelle eine neue Integration (Public integration) +3. Kopiere OAuth Client ID und Secret + +#### LinkedIn (beides erforderlich) +```bash +LINKEDIN_CLIENT_ID=your-linkedin-client-id +LINKEDIN_CLIENT_SECRET=your-linkedin-client-secret +``` +1. Gehe zu [LinkedIn Developers](https://developer.linkedin.com/) +2. Erstelle eine App +3. Füge OAuth-2.0-Scopes hinzu +4. Kopiere Client ID und Client Secret + +--- +## Mit Container ausführen + +Das Repository-Root enthält eine Docker-Konfiguration mit Python 3.10, wichtigen Systempaketen (inklusive Tesseract für OCR) und allen in `environment.yml`/`requirements.txt` definierten Python-Abhängigkeiten, damit der Agent konsistent in isolierten Umgebungen läuft. + +Nachfolgend die Einrichtungsanleitung, um unseren Agent mit Container auszuführen. + +### Image bauen + +Im Repository-Root: + +```bash +docker build -t craftbot . +``` + +### Container ausführen + +Das Image ist so konfiguriert, dass der Agent standardmäßig mit `python -m app.main` gestartet wird. Für eine interaktive Ausführung: + +```bash +docker run --rm -it craftbot +``` + +Wenn du Umgebungsvariablen bereitstellen musst, übergib eine env-Datei (z. B. 
basierend auf `.env.example`): + +```bash +docker run --rm -it --env-file .env craftbot +``` + +Mounte alle Verzeichnisse, die außerhalb des Containers persistent sein sollen (etwa Daten- oder Cache-Ordner), mit `-v`, und passe Ports oder weitere Flags nach Bedarf an dein Deployment an. Das Image enthält Systemabhängigkeiten für OCR (`tesseract`), Bildschirmautomatisierung (`pyautogui`, `mss`, X11-Tools und einen virtuellen Framebuffer) sowie gängige HTTP-Clients, damit der Agent im Container mit Dateien, Netzwerk-APIs und GUI-Automatisierung arbeiten kann. + +### GUI-/Bildschirmautomatisierung aktivieren + +GUI-Aktionen (Maus-/Tastaturereignisse, Screenshots) benötigen einen X11-Server. Du kannst dich an das Host-Display anhängen oder headless mit `xvfb` laufen lassen: + +* Host-Display verwenden (erfordert Linux mit X11): + + ```bash + docker run --rm -it + -e DISPLAY=$DISPLAY \ + -v /tmp/.X11-unix:/tmp/.X11-unix \ + -v $(pwd)/data:/app/app/data \ + craftbot + ``` + + Füge weitere `-v`-Mounts für Ordner hinzu, in die der Agent lesen/schreiben soll. + +* Headless mit virtuellem Display ausführen: + + ```bash + docker run --rm -it --env-file .env craftbot bash -lc "Xvfb :99 -screen 0 1920x1080x24 & export DISPLAY=:99 && exec python -m app.main" + ``` + +Standardmäßig nutzt das Image Python 3.10 und bündelt die Python-Abhängigkeiten aus `environment.yml`/`requirements.txt`, sodass `python -m app.main` sofort funktioniert. + +--- + +## 🤝 Mitwirken + +PRs sind willkommen! Siehe [CONTRIBUTING.md](CONTRIBUTING.md) für den Workflow (Fork → Branch von `dev` → PR). Alle Pull Requests durchlaufen automatisch Lint- und Smoke-Test-CI. Für Fragen oder schnelleren Austausch komm auf unseren [Discord](https://discord.gg/ZN9YHc37HG) oder schreib an thamyikfoong(at)craftos.net. + +## 🧾 Lizenz + +Dieses Projekt steht unter der [MIT-Lizenz](LICENSE). 
Du darfst das Projekt frei nutzen, hosten und monetarisieren (bei Weiterverbreitung und Monetarisierung muss dieses Projekt genannt werden). + +--- + +## ⭐ Danksagung + +Entwickelt und gepflegt von [CraftOS](https://craftos.net/) sowie den Contributors [@zfoong](https://github.com/zfoong) und [@ahmad-ajmal](https://github.com/ahmad-ajmal). +Wenn dir **CraftBot** nützlich ist, gib dem Repository bitte einen ⭐ und teile es mit anderen! diff --git a/README.es.md b/README.es.md index 9a36feea..91106e30 100644 --- a/README.es.md +++ b/README.es.md @@ -27,7 +27,7 @@

- English | 日本語 | 简体中文 | 繁體中文 | 한국어 + English | 日本語 | 简体中文 | 繁體中文 | 한국어 | Português | Français | Deutsch

## 🚀 Descripción general diff --git a/README.fr.md b/README.fr.md new file mode 100644 index 00000000..38bf7c61 --- /dev/null +++ b/README.fr.md @@ -0,0 +1,486 @@ + +
+ CraftBot Banner +
+
+ +
+ Windows + macOS + Linux + + + GitHub Repo stars + + + License + + + Discord + +
+
+ +[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt +
+ +

+ English | 日本語 | 简体中文 | 繁體中文 | 한국어 | Español | Português | Deutsch +

+ +## 🚀 Aperçu +

+CraftBot est votre Assistant IA Personnel qui vit à l'intérieur de votre machine et travaille 24h/24 pour vous. +

+ +Il interprète les tâches de manière autonome, planifie les actions et les exécute pour atteindre vos objectifs. +Il apprend vos préférences et objectifs, et vous aide de façon proactive à planifier et lancer des tâches pour atteindre vos buts de vie. +Les MCP, les Skills et les intégrations d'applications externes sont pris en charge. + +CraftBot attend vos ordres. Configurez dès maintenant votre propre CraftBot. + +
+ CraftBot Overview +
+ +--- + +## ✨ Fonctionnalités + +- **Bring Your Own Key (BYOK)** — Système flexible de fournisseurs LLM prenant en charge OpenAI, Google Gemini, Anthropic Claude, BytePlus et les modèles locaux Ollama. Basculez facilement entre fournisseurs. +- **Système de mémoire** — Distille et consolide les événements de la journée à minuit. +- **Agent proactif** — Apprend vos préférences, habitudes et objectifs de vie. Puis planifie et lance des tâches (avec votre accord, bien sûr) pour vous aider à progresser. +- **Intégration d'outils externes** — Connectez-vous à Google Workspace, Slack, Notion, Zoom, LinkedIn, Discord et Telegram (d'autres à venir !) avec des identifiants intégrés et le support OAuth. +- **MCP** — Intégration du Model Context Protocol pour étendre les capacités de l'agent avec des outils et services externes. +- **Skills** — Framework de skills extensible avec des skills intégrées pour la planification de tâches, la recherche, la revue de code, les opérations git, etc. +- **Multiplateforme** — Prise en charge complète de Windows, macOS et Linux avec des variantes de code spécifiques à chaque plateforme et la conteneurisation Docker. + +> [!IMPORTANT] +> **Remarque sur le mode GUI :** Le mode GUI est encore en phase expérimentale. Vous pourriez rencontrer des problèmes lorsque l'agent bascule en mode GUI. Nous améliorons activement cette fonctionnalité. + +
+ CraftBot Banner + CraftBot Banner +
+ +--- + + +## 🧰 Pour commencer + +### Prérequis +- Python **3.10+** +- `git` (nécessaire pour cloner le dépôt) +- Une clé API pour le fournisseur LLM de votre choix (OpenAI, Gemini ou Anthropic) +- `Node.js` **18+** (optionnel — requis uniquement pour l'interface navigateur) +- `conda` (optionnel — s'il est introuvable, l'installateur propose d'installer Miniconda automatiquement) + +### Installation rapide + +```bash +# Cloner le dépôt +git clone https://github.com/CraftOS-dev/CraftBot.git +cd CraftBot + +# Installer les dépendances +python install.py + +# Lancer l'agent +python run.py +``` + +C'est tout ! La première exécution vous guidera dans la configuration de vos clés API. + +**Remarque :** Si Node.js n'est pas installé, l'installateur fournira des instructions pas à pas. Vous pouvez aussi ignorer le mode navigateur et utiliser la TUI (voir les modes ci-dessous). + +### Que pouvez-vous faire tout de suite ? +- Discuter avec l'agent naturellement +- Lui demander d'exécuter des tâches complexes en plusieurs étapes +- Taper `/help` pour voir les commandes disponibles +- Vous connecter à Google, Slack, Notion et plus + +### 🖥️ Modes d'interface + +
+ CraftOS Banner +
+ +CraftBot propose plusieurs modes d'UI. Choisissez selon vos préférences : + +| Mode | Commande | Prérequis | Idéal pour | +|------|---------|--------------|----------| +| **Browser** | `python run.py` | Node.js 18+ | Interface web moderne, la plus simple à utiliser | +| **TUI** | `python run.py --tui` | Aucun | UI en terminal, aucune dépendance requise | +| **CLI** | `python run.py --cli` | Aucun | Ligne de commande, léger | +| **GUI** | `python run.py --gui` | `install.py --gui` | Automatisation de bureau avec retour visuel | + +Le **mode navigateur** est le mode par défaut et recommandé. Si vous n'avez pas Node.js, l'installateur vous guidera pour l'installer, ou vous pouvez utiliser le **mode TUI**. + +--- + +## 🧩 Aperçu de l'architecture + +| Composant | Description | +|-----------|-------------| +| **Agent Base** | Couche d'orchestration centrale qui gère le cycle de vie des tâches, coordonne les composants et pilote la boucle agentique principale. | +| **LLM Interface** | Interface unifiée prenant en charge plusieurs fournisseurs LLM (OpenAI, Gemini, Anthropic, BytePlus, Ollama). | +| **Context Engine** | Génère des prompts optimisés avec support du cache KV. | +| **Action Manager** | Récupère et exécute les actions depuis la bibliothèque. Les actions personnalisées sont faciles à étendre. | +| **Action Router** | Sélectionne intelligemment l'action la plus adaptée aux exigences de la tâche et résout les paramètres d'entrée via le LLM au besoin. | +| **Event Stream** | Système de publication d'événements en temps réel pour le suivi de la progression des tâches, les mises à jour d'UI et le monitoring d'exécution. | +| **Memory Manager** | Mémoire sémantique basée sur le RAG via ChromaDB. Gère le découpage, l'embedding, la récupération et les mises à jour incrémentales. | +| **State Manager** | Gestion globale de l'état pour suivre le contexte d'exécution de l'agent, l'historique de conversation et la configuration d'exécution. 
| +| **Task Manager** | Gère les définitions de tâches, permet des modes simples et complexes, crée des to-dos et suit les workflows multi-étapes. | +| **Skill Manager** | Charge et injecte des skills enfichables dans le contexte de l'agent. | +| **MCP Adapter** | Intégration Model Context Protocol qui convertit les outils MCP en actions natives. | +| **TUI Interface** | Interface utilisateur en terminal construite avec le framework Textual pour une utilisation interactive en ligne de commande. | +| **GUI Module** | Automatisation GUI expérimentale utilisant des conteneurs Docker, OmniParser pour la détection d'éléments UI et le client Gradio. | + +--- + +## 🔜 Roadmap + +- [X] **Module de mémoire** — Terminé. +- [ ] **Intégration d'outils externes** — En cours d'ajout ! +- [X] **Couche MCP** — Terminée. +- [X] **Couche Skills** — Terminée. +- [X] **Comportement proactif** — En cours + +--- + +## 🖥️ Mode GUI (optionnel) + +Le mode GUI active l'automatisation d'écran — l'agent peut voir et interagir avec un environnement de bureau. C'est optionnel et nécessite une configuration supplémentaire. + +```bash +# Installer avec le support GUI (via pip, sans conda) +python install.py --gui + +# Installer avec le support GUI et conda +python install.py --gui --conda + +# Lancer en mode GUI +python run.py --gui +``` + +> [!NOTE] +> Le mode GUI est expérimental et nécessite des dépendances supplémentaires (~4 Go pour les poids du modèle). Si vous n'avez pas besoin d'automatisation de bureau, passez cette étape et utilisez le mode Browser/TUI, qui n'a pas de dépendances additionnelles. 
+ +--- + +## 📋 Référence des commandes + +### install.py + +| Flag | Description | +|------|-------------| +| `--gui` | Installer les composants GUI (OmniParser) | +| `--conda` | Utiliser un environnement conda (optionnel) | +| `--cpu-only` | Installer PyTorch en version CPU uniquement (avec `--gui`) | + +### run.py + +| Flag | Description | +|------|-------------| +| (aucun) | Lancer en mode **Browser** (recommandé, nécessite Node.js) | +| `--tui` | Lancer en mode **Terminal UI** (aucune dépendance) | +| `--cli` | Lancer en mode **CLI** (léger) | +| `--gui` | Activer le mode automatisation GUI (nécessite `install.py --gui` au préalable) | + +### service.py + +| Commande | Description | +|---------|-------------| +| `install` | Installe les deps, enregistre le démarrage automatique et lance CraftBot | +| `start` | Démarre CraftBot en arrière-plan | +| `stop` | Arrête CraftBot | +| `restart` | Arrête puis redémarre | +| `status` | Affiche l'état d'exécution et celui du démarrage automatique | +| `logs [-n N]` | Affiche les N dernières lignes de log (par défaut : 50) | +| `uninstall` | Supprime l'enregistrement du démarrage automatique | + +**Exemples d'installation :** +```bash +# Installation simple via pip (sans conda) +python install.py + +# Avec support GUI (via pip, sans conda) +python install.py --gui + +# Avec GUI sur systèmes CPU uniquement (via pip, sans conda) +python install.py --gui --cpu-only + +# Avec environnement conda (recommandé pour les utilisateurs de conda) +python install.py --conda + +# Avec support GUI et conda +python install.py --gui --conda + +# Avec GUI sur systèmes CPU uniquement, avec conda +python install.py --gui --conda --cpu-only +``` + +**Exécuter CraftBot :** + +```powershell +# Mode Browser (par défaut, nécessite Node.js) +python run.py + +# Mode TUI (pas de Node.js nécessaire) +python run.py --tui + +# Mode CLI (léger) +python run.py --cli + +# Mode GPU/GUI +python run.py --gui + +# Avec environnement conda +conda run -n 
craftbot python run.py + +# Ou en utilisant le chemin complet si conda n'est pas dans le PATH +&"$env:USERPROFILE\miniconda3\Scripts\conda.exe" run -n craftbot python run.py +``` + +**Linux/macOS (Bash) :** +```bash +# Mode Browser (par défaut, nécessite Node.js) +python run.py + +# Mode TUI (pas de Node.js nécessaire) +python run.py --tui + +# Mode CLI (léger) +python run.py --cli + +# Mode GPU/GUI +python run.py --gui + +# Avec environnement conda +conda run -n craftbot python run.py +``` + +### 🔧 Service en arrière-plan (recommandé) + +Exécutez CraftBot en tant que service en arrière-plan pour qu'il continue de fonctionner même après la fermeture du terminal. Un raccourci de bureau est créé automatiquement pour rouvrir le navigateur à tout moment. + +```bash +# Installer les dépendances, enregistrer le démarrage automatique à la connexion et lancer CraftBot +python service.py install +``` + +C'est tout. Le terminal se ferme tout seul, CraftBot tourne en arrière-plan et le navigateur s'ouvre automatiquement. 
+ +```bash +# Autres commandes du service : +python service.py start # Démarre CraftBot en arrière-plan +python service.py status # Vérifie s'il tourne +python service.py stop # Arrête CraftBot +python service.py restart # Redémarre CraftBot +python service.py logs # Affiche les logs récents +``` + +| Commande | Description | +|---------|-------------| +| `python service.py install` | Installe les dépendances, enregistre le démarrage automatique à la connexion, lance CraftBot, ouvre le navigateur et ferme le terminal automatiquement | +| `python service.py start` | Démarre CraftBot en arrière-plan — redémarre automatiquement s'il est déjà lancé (le terminal se ferme tout seul) | +| `python service.py stop` | Arrête CraftBot | +| `python service.py restart` | Arrête puis démarre CraftBot | +| `python service.py status` | Vérifie si CraftBot tourne et si le démarrage automatique est activé | +| `python service.py logs` | Affiche les logs récents (`-n 100` pour plus de lignes) | +| `python service.py uninstall` | Arrête CraftBot, supprime le démarrage automatique, désinstalle les paquets pip et purge le cache pip | + +> [!TIP] +> Après `service.py start` ou `service.py install`, un **raccourci CraftBot sur le bureau** est créé automatiquement. Si vous fermez le navigateur par accident, double-cliquez sur le raccourci pour le rouvrir. + +> [!NOTE] +> **Installation :** L'installateur fournit maintenant des indications claires si des dépendances manquent. Si Node.js est introuvable, on vous proposera de l'installer ou de basculer en mode TUI. L'installation détecte automatiquement la disponibilité du GPU et bascule en mode CPU si nécessaire. + +> [!TIP] +> **Première configuration :** CraftBot vous guidera dans une séquence d'onboarding pour configurer les clés API, le nom de l'agent, les MCP et les Skills. + +> [!NOTE] +> **Playwright Chromium :** Optionnel pour l'intégration WhatsApp Web. Si l'installation échoue, l'agent fonctionnera toujours pour les autres tâches. 
Installez-le manuellement plus tard avec : `playwright install chromium` + +--- + +## � Dépannage et problèmes courants + +### Node.js manquant (pour le mode navigateur) +Si vous voyez **"npm not found in PATH"** en lançant `python run.py` : +1. Téléchargez depuis [nodejs.org](https://nodejs.org/) (choisissez la version LTS) +2. Installez et redémarrez votre terminal +3. Relancez `python run.py` + +**Alternative :** Utilisez le mode TUI (Node.js non requis) : +```bash +python run.py --tui +``` + +### L'installation échoue sur les dépendances +L'installateur fournit désormais des messages d'erreur détaillés avec des solutions. Si l'installation échoue : +- **Vérifiez la version de Python :** assurez-vous d'avoir Python 3.10+ (`python --version`) +- **Vérifiez votre connexion :** les dépendances sont téléchargées pendant l'installation +- **Videz le cache pip :** `pip install --upgrade pip` puis réessayez + +### Problèmes d'installation de Playwright +L'installation de Playwright Chromium est optionnelle. En cas d'échec : +- L'agent **continuera de fonctionner** pour les autres tâches +- Vous pouvez l'ignorer ou l'installer plus tard : `playwright install chromium` +- Nécessaire uniquement pour l'intégration WhatsApp Web + +### Problèmes GPU/CUDA +L'installateur détecte automatiquement la disponibilité du GPU : +- En cas d'échec de l'installation CUDA, il bascule automatiquement en mode CPU +- Pour une configuration CPU manuelle : `python install.py --gui --cpu-only` + +Pour un dépannage détaillé, consultez [INSTALLATION_FIX.md](INSTALLATION_FIX.md). + +--- + +L'agent peut se connecter à divers services via OAuth. Les builds de release incluent des identifiants intégrés, mais vous pouvez aussi utiliser les vôtres. 
+ +### Démarrage rapide + +Pour les builds de release avec identifiants intégrés : +``` +/google login # Connecter Google Workspace +/zoom login # Connecter Zoom +/slack invite # Connecter Slack +/notion invite # Connecter Notion +/linkedin login # Connecter LinkedIn +``` + +### Détails des services + +| Service | Type d'auth | Commande | Secret requis ? | +|---------|-----------|---------|------------------| +| Google | PKCE | `/google login` | Non (PKCE) | +| Zoom | PKCE | `/zoom login` | Non (PKCE) | +| Slack | OAuth 2.0 | `/slack invite` | Oui | +| Notion | OAuth 2.0 | `/notion invite` | Oui | +| LinkedIn | OAuth 2.0 | `/linkedin login` | Oui | + +### Utiliser vos propres identifiants + +Si vous préférez utiliser vos propres identifiants OAuth, ajoutez-les à votre fichier `.env` : + +#### Google (PKCE — uniquement le Client ID) +```bash +GOOGLE_CLIENT_ID=your-client-id.apps.googleusercontent.com +``` +1. Allez sur la [Google Cloud Console](https://console.cloud.google.com/) +2. Activez les API Gmail, Calendar, Drive et People +3. Créez des identifiants OAuth de type **Desktop app** +4. Copiez le Client ID (le secret n'est pas requis en PKCE) + +#### Zoom (PKCE — uniquement le Client ID) +```bash +ZOOM_CLIENT_ID=your-zoom-client-id +``` +1. Allez sur le [Zoom Marketplace](https://marketplace.zoom.us/) +2. Créez une application OAuth +3. Copiez le Client ID + +#### Slack (les deux requis) +```bash +SLACK_SHARED_CLIENT_ID=your-slack-client-id +SLACK_SHARED_CLIENT_SECRET=your-slack-client-secret +``` +1. Allez sur [Slack API](https://api.slack.com/apps) +2. Créez une nouvelle application +3. Ajoutez les scopes OAuth : `chat:write`, `channels:read`, `users:read`, etc. +4. Copiez le Client ID et le Client Secret + +#### Notion (les deux requis) +```bash +NOTION_SHARED_CLIENT_ID=your-notion-client-id +NOTION_SHARED_CLIENT_SECRET=your-notion-client-secret +``` +1. Allez sur [Notion Developers](https://developers.notion.com/) +2. 
Créez une nouvelle intégration (Public integration) +3. Copiez l'OAuth Client ID et le Secret + +#### LinkedIn (les deux requis) +```bash +LINKEDIN_CLIENT_ID=your-linkedin-client-id +LINKEDIN_CLIENT_SECRET=your-linkedin-client-secret +``` +1. Allez sur [LinkedIn Developers](https://developer.linkedin.com/) +2. Créez une application +3. Ajoutez les scopes OAuth 2.0 +4. Copiez le Client ID et le Client Secret + +--- +## Exécuter avec un conteneur + +La racine du dépôt contient une configuration Docker avec Python 3.10, des paquets système clés (dont Tesseract pour l'OCR) et toutes les dépendances Python définies dans `environment.yml`/`requirements.txt`, pour que l'agent s'exécute de façon cohérente dans des environnements isolés. + +Ci-dessous les instructions pour exécuter notre agent en conteneur. + +### Construire l'image + +Depuis la racine du dépôt : + +```bash +docker build -t craftbot . +``` + +### Exécuter le conteneur + +L'image est configurée pour lancer l'agent avec `python -m app.main` par défaut. Pour l'exécuter en mode interactif : + +```bash +docker run --rm -it craftbot +``` + +Si vous devez fournir des variables d'environnement, passez un fichier env (par exemple basé sur `.env.example`) : + +```bash +docker run --rm -it --env-file .env craftbot +``` + +Montez tous les répertoires qui doivent persister en dehors du conteneur (comme les dossiers de données ou cache) via `-v`, et ajustez les ports ou autres flags selon votre déploiement. L'image embarque les dépendances système pour l'OCR (`tesseract`), l'automatisation d'écran (`pyautogui`, `mss`, utilitaires X11 et framebuffer virtuel) et les clients HTTP courants, afin que l'agent puisse travailler avec les fichiers, les API réseau et l'automatisation GUI dans le conteneur. + +### Activer l'automatisation GUI/écran + +Les actions GUI (événements souris/clavier, captures d'écran) nécessitent un serveur X11. 
Vous pouvez vous attacher à l'affichage hôte ou exécuter en headless avec `xvfb` : + +* Utiliser l'affichage de l'hôte (nécessite Linux avec X11) : + + ```bash + docker run --rm -it + -e DISPLAY=$DISPLAY \ + -v /tmp/.X11-unix:/tmp/.X11-unix \ + -v $(pwd)/data:/app/app/data \ + craftbot + ``` + + Ajoutez d'autres montages `-v` pour les dossiers que l'agent doit lire/écrire. + +* Exécution headless avec un affichage virtuel : + + ```bash + docker run --rm -it --env-file .env craftbot bash -lc "Xvfb :99 -screen 0 1920x1080x24 & export DISPLAY=:99 && exec python -m app.main" + ``` + +Par défaut, l'image utilise Python 3.10 et embarque les dépendances Python de `environment.yml`/`requirements.txt`, donc `python -m app.main` fonctionne immédiatement. + +--- + +## 🤝 Comment contribuer + +Les PR sont les bienvenues ! Voir [CONTRIBUTING.md](CONTRIBUTING.md) pour le workflow (fork → branche depuis `dev` → PR). Toutes les pull requests passent automatiquement par un CI lint + smoke-test. Pour toute question ou une discussion plus rapide, rejoignez-nous sur [Discord](https://discord.gg/ZN9YHc37HG) ou envoyez un email à thamyikfoong(at)craftos.net. + +## 🧾 Licence + +Ce projet est sous [licence MIT](LICENSE). Vous êtes libre d'utiliser, d'héberger et de monétiser ce projet (vous devez créditer ce projet en cas de distribution et de monétisation). + +--- + +## ⭐ Remerciements + +Développé et maintenu par [CraftOS](https://craftos.net/) et les contributeurs [@zfoong](https://github.com/zfoong) et [@ahmad-ajmal](https://github.com/ahmad-ajmal). +Si **CraftBot** vous est utile, mettez une ⭐ au dépôt et partagez-le avec d'autres ! diff --git a/README.ja.md b/README.ja.md index 9bb2a40e..25b4ff7f 100644 --- a/README.ja.md +++ b/README.ja.md @@ -27,7 +27,7 @@

- English | 简体中文 | 繁體中文 | 한국어 | Español + English | 简体中文 | 繁體中文 | 한국어 | Español | Português | Français | Deutsch

## 🚀 概要 diff --git a/README.ko.md b/README.ko.md index 105f7622..9db26cfa 100644 --- a/README.ko.md +++ b/README.ko.md @@ -27,7 +27,7 @@

- English | 日本語 | 简体中文 | 繁體中文 | Español + English | 日本語 | 简体中文 | 繁體中文 | Español | Português | Français | Deutsch

## 🚀 개요 diff --git a/README.md b/README.md index 69087a55..dced9251 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@

- 日本語 | 简体中文 | 繁體中文 | 한국어 | Español + 日本語 | 简体中文 | 繁體中文 | 한국어 | Español | Português | Français | Deutsch

## 🚀 Overview diff --git a/README.pt-BR.md b/README.pt-BR.md new file mode 100644 index 00000000..0143ead9 --- /dev/null +++ b/README.pt-BR.md @@ -0,0 +1,486 @@ + +
+ CraftBot Banner +
+
+ +
+ Windows + macOS + Linux + + + GitHub Repo stars + + + License + + + Discord + +
+
+ +[![SPONSORED BY E2B FOR STARTUPS](https://img.shields.io/badge/SPONSORED%20BY-E2B%20FOR%20STARTUPS-ff8800?style=for-the-badge)](https://e2b.dev/startups) + +CraftBot - Self-hosted proactive AI assistant that lives locally | Product Hunt +
+ +

+ English | 日本語 | 简体中文 | 繁體中文 | 한국어 | Español | Français | Deutsch +

+ +## 🚀 Visão geral +

+O CraftBot é o seu Assistente de IA Pessoal, que vive dentro da sua máquina e trabalha 24/7 para você. +

+ +Ele interpreta tarefas de forma autônoma, planeja ações e as executa para alcançar seus objetivos. +Aprende suas preferências e metas, ajudando-o proativamente a planejar e iniciar tarefas para atingir seus objetivos de vida. +Suporta MCPs, Skills e integrações com aplicativos externos. + +O CraftBot aguarda suas ordens. Configure o seu agora mesmo. + +
+ CraftBot Overview +
+ +--- + +## ✨ Recursos + +- **Bring Your Own Key (BYOK)** — Sistema flexível de provedores de LLM com suporte a OpenAI, Google Gemini, Anthropic Claude, BytePlus e modelos locais do Ollama. Troque de provedor com facilidade. +- **Sistema de Memória** — Destila e consolida os eventos ocorridos durante o dia à meia-noite. +- **Agente Proativo** — Aprende suas preferências, hábitos e metas de vida. Depois, planeja e inicia tarefas (com sua aprovação, claro) para ajudá-lo a evoluir. +- **Integração com ferramentas externas** — Conecte-se a Google Workspace, Slack, Notion, Zoom, LinkedIn, Discord e Telegram (mais a caminho!) com credenciais embutidas e suporte a OAuth. +- **MCP** — Integração com o Model Context Protocol para ampliar as capacidades do agente com ferramentas e serviços externos. +- **Skills** — Framework de skills extensível com skills embutidas para planejamento de tarefas, pesquisa, revisão de código, operações de git e muito mais. +- **Multiplataforma** — Suporte completo para Windows, macOS e Linux, com variantes de código específicas por plataforma e conteinerização via Docker. + +> [!IMPORTANT] +> **Observação sobre o modo GUI:** O modo GUI ainda está em fase experimental. Isso significa que você pode encontrar problemas quando o agente alternar para o modo GUI. Estamos aprimorando este recurso ativamente. + +
+ CraftBot Banner + CraftBot Banner +
+ +--- + + +## 🧰 Começando + +### Pré-requisitos +- Python **3.10+** +- `git` (necessário para clonar o repositório) +- Uma chave de API do provedor LLM escolhido (OpenAI, Gemini ou Anthropic) +- `Node.js` **18+** (opcional — necessário apenas para a interface no navegador) +- `conda` (opcional — se não for encontrado, o instalador pode instalar o Miniconda automaticamente) + +### Instalação rápida + +```bash +# Clone o repositório +git clone https://github.com/CraftOS-dev/CraftBot.git +cd CraftBot + +# Instale as dependências +python install.py + +# Execute o agente +python run.py +``` + +É isso! Na primeira execução, você será guiado para configurar suas chaves de API. + +**Observação:** Se você não tiver o Node.js instalado, o instalador fornecerá instruções passo a passo. Também é possível ignorar o modo navegador e usar a TUI (veja os modos abaixo). + +### O que você pode fazer logo de cara? +- Conversar com o agente de forma natural +- Pedir que ele execute tarefas complexas de várias etapas +- Digitar `/help` para ver os comandos disponíveis +- Conectar-se ao Google, Slack, Notion e muito mais + +### 🖥️ Modos de interface + +
+ CraftOS Banner +
+ +O CraftBot oferece vários modos de UI. Escolha conforme sua preferência: + +| Modo | Comando | Requisitos | Indicado para | +|------|---------|--------------|----------| +| **Browser** | `python run.py` | Node.js 18+ | Interface web moderna, a mais fácil de usar | +| **TUI** | `python run.py --tui` | Nenhum | UI em terminal, sem dependências | +| **CLI** | `python run.py --cli` | Nenhum | Linha de comando, leve | +| **GUI** | `python run.py --gui` | `install.py --gui` | Automação de desktop com feedback visual | + +O **modo Browser** é o padrão e recomendado. Se não tiver o Node.js, o instalador fornecerá instruções de instalação, ou você pode usar o **modo TUI**. + +--- + +## 🧩 Visão geral da arquitetura + +| Componente | Descrição | +|-----------|-------------| +| **Agent Base** | Camada central de orquestração que gerencia o ciclo de vida das tarefas, coordena os componentes e cuida do loop principal do agente. | +| **LLM Interface** | Interface unificada com suporte a vários provedores de LLM (OpenAI, Gemini, Anthropic, BytePlus, Ollama). | +| **Context Engine** | Gera prompts otimizados com suporte a KV-cache. | +| **Action Manager** | Recupera e executa ações da biblioteca. Ações personalizadas são fáceis de estender. | +| **Action Router** | Seleciona de forma inteligente a ação que melhor corresponde aos requisitos da tarefa e resolve parâmetros de entrada via LLM quando necessário. | +| **Event Stream** | Sistema de publicação de eventos em tempo real para acompanhar o progresso das tarefas, atualizar a UI e monitorar a execução. | +| **Memory Manager** | Memória semântica baseada em RAG usando o ChromaDB. Lida com chunking, embeddings, recuperação e atualizações incrementais. | +| **State Manager** | Gerenciamento global de estado para rastrear contexto de execução do agente, histórico de conversas e configurações de runtime. 
|
+| **Task Manager** | Gerencia definições de tarefas, habilita modos simples e complexos, cria to-dos e rastreia workflows multi-etapa. |
+| **Skill Manager** | Carrega e injeta skills plugáveis no contexto do agente. |
+| **MCP Adapter** | Integração com o Model Context Protocol que converte ferramentas MCP em ações nativas. |
+| **TUI Interface** | Interface de usuário no terminal construída com o framework Textual para operação interativa por linha de comando. |
+| **GUI Module** | Automação GUI experimental usando contêineres Docker, OmniParser para detecção de elementos de UI e cliente Gradio. |
+
+---
+
+## 🔜 Roadmap
+
+- [X] **Módulo de memória** — Concluído.
+- [ ] **Integração com ferramentas externas** — Ainda adicionando mais!
+- [X] **Camada MCP** — Concluída.
+- [X] **Camada de Skills** — Concluída.
+- [ ] **Comportamento proativo** — Em andamento
+
+---
+
+## 🖥️ Modo GUI (opcional)
+
+O modo GUI habilita a automação de tela — o agente pode ver e interagir com um ambiente de desktop. Isso é opcional e requer configuração adicional.
+
+```bash
+# Instalar com suporte a GUI (usando pip, sem conda)
+python install.py --gui
+
+# Instalar com suporte a GUI e conda
+python install.py --gui --conda
+
+# Executar em modo GUI
+python run.py --gui
+```
+
+> [!NOTE]
+> O modo GUI é experimental e requer dependências adicionais (~4GB para os pesos do modelo). Se você não precisar de automação de desktop, pule esta parte e use os modos Browser/TUI, que não têm dependências extras. 
+ +--- + +## 📋 Referência de comandos + +### install.py + +| Flag | Descrição | +|------|-------------| +| `--gui` | Instala componentes de GUI (OmniParser) | +| `--conda` | Usa ambiente conda (opcional) | +| `--cpu-only` | Instala PyTorch apenas para CPU (com `--gui`) | + +### run.py + +| Flag | Descrição | +|------|-------------| +| (nenhum) | Executa no modo **Browser** (recomendado, requer Node.js) | +| `--tui` | Executa no modo **Terminal UI** (sem dependências) | +| `--cli` | Executa no modo **CLI** (leve) | +| `--gui` | Habilita o modo de automação GUI (requer `install.py --gui` antes) | + +### service.py + +| Comando | Descrição | +|---------|-------------| +| `install` | Instala deps, registra auto-start e inicia o CraftBot | +| `start` | Inicia o CraftBot em segundo plano | +| `stop` | Para o CraftBot | +| `restart` | Para e inicia novamente | +| `status` | Mostra o status de execução e do auto-start | +| `logs [-n N]` | Mostra as últimas N linhas do log (padrão: 50) | +| `uninstall` | Remove o registro do auto-start | + +**Exemplos de instalação:** +```bash +# Instalação simples via pip (sem conda) +python install.py + +# Com suporte a GUI (via pip, sem conda) +python install.py --gui + +# Com GUI em sistemas somente CPU (via pip, sem conda) +python install.py --gui --cpu-only + +# Com ambiente conda (recomendado para usuários de conda) +python install.py --conda + +# Com suporte a GUI e conda +python install.py --gui --conda + +# Com GUI em sistemas somente CPU com conda +python install.py --gui --conda --cpu-only +``` + +**Executando o CraftBot:** + +```powershell +# Modo Browser (padrão, requer Node.js) +python run.py + +# Modo TUI (não requer Node.js) +python run.py --tui + +# Modo CLI (leve) +python run.py --cli + +# Com modo GPU/GUI +python run.py --gui + +# Com ambiente conda +conda run -n craftbot python run.py + +# Ou usando caminho completo se o conda não estiver no PATH +&"$env:USERPROFILE\miniconda3\Scripts\conda.exe" run -n craftbot python 
run.py +``` + +**Linux/macOS (Bash):** +```bash +# Modo Browser (padrão, requer Node.js) +python run.py + +# Modo TUI (não requer Node.js) +python run.py --tui + +# Modo CLI (leve) +python run.py --cli + +# Com modo GPU/GUI +python run.py --gui + +# Com ambiente conda +conda run -n craftbot python run.py +``` + +### 🔧 Serviço em segundo plano (recomendado) + +Execute o CraftBot como um serviço em segundo plano para que ele continue rodando mesmo após fechar o terminal. Um atalho na área de trabalho é criado automaticamente, permitindo reabrir o navegador a qualquer momento. + +```bash +# Instala dependências, registra auto-start no login e inicia o CraftBot +python service.py install +``` + +É isso. O terminal se fecha sozinho, o CraftBot roda em segundo plano e o navegador abre automaticamente. + +```bash +# Outros comandos do serviço: +python service.py start # Inicia o CraftBot em segundo plano +python service.py status # Verifica se está em execução +python service.py stop # Para o CraftBot +python service.py restart # Reinicia o CraftBot +python service.py logs # Mostra logs recentes +``` + +| Comando | Descrição | +|---------|-------------| +| `python service.py install` | Instala dependências, registra auto-start no login, inicia o CraftBot, abre o navegador e fecha o terminal automaticamente | +| `python service.py start` | Inicia o CraftBot em segundo plano — reinicia automaticamente se já estiver rodando (o terminal se fecha sozinho) | +| `python service.py stop` | Para o CraftBot | +| `python service.py restart` | Para e inicia o CraftBot | +| `python service.py status` | Verifica se o CraftBot está rodando e se o auto-start está habilitado | +| `python service.py logs` | Mostra a saída recente do log (`-n 100` para mais linhas) | +| `python service.py uninstall` | Para o CraftBot, remove o registro de auto-start, desinstala pacotes pip e limpa o cache do pip | + +> [!TIP] +> Após `service.py start` ou `service.py install`, um **atalho do CraftBot na 
área de trabalho** é criado automaticamente. Se você fechar o navegador por acidente, basta clicar duas vezes no atalho para reabri-lo.
+
+> [!NOTE]
+> **Instalação:** O instalador agora fornece orientações claras se faltarem dependências. Se o Node.js não for encontrado, você será orientado a instalá-lo ou poderá alternar para o modo TUI. A instalação detecta automaticamente a disponibilidade de GPU e recorre ao modo somente CPU quando necessário.
+
+> [!TIP]
+> **Configuração inicial:** O CraftBot vai guiá-lo por um onboarding para configurar chaves de API, o nome do agente, MCPs e Skills.
+
+> [!NOTE]
+> **Playwright Chromium:** Opcional para a integração com o WhatsApp Web. Se a instalação falhar, o agente continuará funcionando normalmente para outras tarefas. Instale manualmente depois com: `playwright install chromium`
+
+---
+
+## 🛠️ Solução de problemas e dúvidas comuns
+
+### Node.js ausente (para o modo navegador)
+Se aparecer **"npm not found in PATH"** ao executar `python run.py`:
+1. Baixe em [nodejs.org](https://nodejs.org/) (escolha a versão LTS)
+2. Instale e reinicie o terminal
+3. Execute `python run.py` novamente
+
+**Alternativa:** Use o modo TUI (sem necessidade de Node.js):
+```bash
+python run.py --tui
+```
+
+### A instalação falha nas dependências
+O instalador agora fornece mensagens de erro detalhadas com soluções. Se a instalação falhar:
+- **Verifique a versão do Python:** tenha o Python 3.10+ (`python --version`)
+- **Verifique sua internet:** as dependências são baixadas durante a instalação
+- **Limpe o cache do pip:** `pip install --upgrade pip` e tente de novo
+
+### Problemas com a instalação do Playwright
+A instalação do Playwright Chromium é opcional. 
Se falhar:
+- O agente **continuará funcionando** para outras tarefas
+- Você pode pular ou instalar depois: `playwright install chromium`
+- Só é necessário para a integração com o WhatsApp Web
+
+### Problemas com GPU/CUDA
+O instalador detecta automaticamente a disponibilidade da GPU:
+- Se a instalação do CUDA falhar, ele cai para o modo CPU automaticamente
+- Para configuração manual de CPU: `python install.py --gui --cpu-only`
+
+Para uma solução detalhada, veja [INSTALLATION_FIX.md](INSTALLATION_FIX.md).
+
+---
+
+## 🔌 Integrações com aplicativos
+
+O agente pode se conectar a diversos serviços via OAuth. As builds de release vêm com credenciais embutidas, mas você também pode usar as suas.
+
+### Início rápido
+
+Para builds de release com credenciais embutidas:
+```
+/google login # Conectar ao Google Workspace
+/zoom login # Conectar ao Zoom
+/slack invite # Conectar ao Slack
+/notion invite # Conectar ao Notion
+/linkedin login # Conectar ao LinkedIn
+```
+
+### Detalhes do serviço
+
+| Serviço | Tipo de auth | Comando | Requer segredo? |
+|---------|-----------|---------|------------------|
+| Google | PKCE | `/google login` | Não (PKCE) |
+| Zoom | PKCE | `/zoom login` | Não (PKCE) |
+| Slack | OAuth 2.0 | `/slack invite` | Sim |
+| Notion | OAuth 2.0 | `/notion invite` | Sim |
+| LinkedIn | OAuth 2.0 | `/linkedin login` | Sim |
+
+### Usando suas próprias credenciais
+
+Se preferir usar suas próprias credenciais OAuth, adicione-as ao arquivo `.env`:
+
+#### Google (PKCE — apenas Client ID)
+```bash
+GOOGLE_CLIENT_ID=your-client-id.apps.googleusercontent.com
+```
+1. Acesse o [Google Cloud Console](https://console.cloud.google.com/)
+2. Habilite as APIs de Gmail, Calendar, Drive e People
+3. Crie credenciais OAuth do tipo **Desktop app**
+4. Copie o Client ID (o secret não é necessário com PKCE)
+
+#### Zoom (PKCE — apenas Client ID)
+```bash
+ZOOM_CLIENT_ID=your-zoom-client-id
+```
+1. Acesse o [Zoom Marketplace](https://marketplace.zoom.us/)
+2. Crie um app OAuth
+3. 
Copie o Client ID + +#### Slack (requer ambos) +```bash +SLACK_SHARED_CLIENT_ID=your-slack-client-id +SLACK_SHARED_CLIENT_SECRET=your-slack-client-secret +``` +1. Acesse o [Slack API](https://api.slack.com/apps) +2. Crie um novo app +3. Adicione os escopos OAuth: `chat:write`, `channels:read`, `users:read` etc. +4. Copie o Client ID e o Client Secret + +#### Notion (requer ambos) +```bash +NOTION_SHARED_CLIENT_ID=your-notion-client-id +NOTION_SHARED_CLIENT_SECRET=your-notion-client-secret +``` +1. Acesse o [Notion Developers](https://developers.notion.com/) +2. Crie uma nova integração (Public integration) +3. Copie o OAuth Client ID e o Secret + +#### LinkedIn (requer ambos) +```bash +LINKEDIN_CLIENT_ID=your-linkedin-client-id +LINKEDIN_CLIENT_SECRET=your-linkedin-client-secret +``` +1. Acesse o [LinkedIn Developers](https://developer.linkedin.com/) +2. Crie um app +3. Adicione os escopos OAuth 2.0 +4. Copie o Client ID e o Client Secret + +--- +## Executar com contêiner + +A raiz do repositório inclui uma configuração Docker com Python 3.10, pacotes de sistema essenciais (incluindo Tesseract para OCR) e todas as dependências Python definidas em `environment.yml`/`requirements.txt`, para que o agente execute de forma consistente em ambientes isolados. + +Abaixo estão as instruções de configuração para rodar nosso agente em contêiner. + +### Construir a imagem + +Na raiz do repositório: + +```bash +docker build -t craftbot . +``` + +### Executar o contêiner + +A imagem está configurada para iniciar o agente com `python -m app.main` por padrão. 
Para executar interativamente:
+
+```bash
+docker run --rm -it craftbot
+```
+
+Se precisar fornecer variáveis de ambiente, passe um arquivo env (por exemplo, baseado em `.env.example`):
+
+```bash
+docker run --rm -it --env-file .env craftbot
+```
+
+Monte quaisquer diretórios que devam persistir fora do contêiner (como pastas de dados ou cache) usando `-v`, e ajuste portas e outras flags conforme necessário para sua implantação. A imagem traz dependências de sistema para OCR (`tesseract`), automação de tela (`pyautogui`, `mss`, utilitários X11 e framebuffer virtual) e clientes HTTP comuns, para que o agente trabalhe com arquivos, APIs de rede e automação de GUI dentro do contêiner.
+
+### Habilitando automação GUI/tela
+
+Ações GUI (eventos de mouse/teclado, capturas de tela) requerem um servidor X11. Você pode conectar-se ao display do host ou rodar em modo headless com `xvfb`:
+
+* Usar o display do host (requer Linux com X11):
+
+  ```bash
+  docker run --rm -it \
+  -e DISPLAY=$DISPLAY \
+  -v /tmp/.X11-unix:/tmp/.X11-unix \
+  -v $(pwd)/data:/app/app/data \
+  craftbot
+  ```
+
+  Adicione montagens `-v` extras para quaisquer pastas que o agente deva ler/escrever.
+
+* Executar em modo headless com display virtual:
+
+  ```bash
+  docker run --rm -it --env-file .env craftbot bash -lc "Xvfb :99 -screen 0 1920x1080x24 & export DISPLAY=:99 && exec python -m app.main"
+  ```
+
+Por padrão, a imagem usa Python 3.10 e empacota as dependências Python de `environment.yml`/`requirements.txt`, portanto `python -m app.main` funciona de imediato.
+
+---
+
+## 🤝 Como contribuir
+
+PRs são bem-vindos! Consulte [CONTRIBUTING.md](CONTRIBUTING.md) para o fluxo (fork → branch a partir de `dev` → PR). Todos os pull requests passam automaticamente por lint + smoke-test no CI. Para dúvidas ou uma conversa mais rápida, entre no nosso [Discord](https://discord.gg/ZN9YHc37HG) ou envie e-mail para thamyikfoong(at)craftos.net. 
+ +## 🧾 Licença + +Este projeto está licenciado sob a [Licença MIT](LICENSE). Você é livre para usar, hospedar e monetizar este projeto (é necessário dar crédito ao projeto em caso de distribuição e monetização). + +--- + +## ⭐ Agradecimentos + +Desenvolvido e mantido por [CraftOS](https://craftos.net/) e pelos contribuidores [@zfoong](https://github.com/zfoong) e [@ahmad-ajmal](https://github.com/ahmad-ajmal). +Se o **CraftBot** é útil para você, por favor dê uma ⭐ no repositório e compartilhe com outras pessoas! diff --git a/README.zh-TW.md b/README.zh-TW.md index 31e87799..9b2a1cf5 100644 --- a/README.zh-TW.md +++ b/README.zh-TW.md @@ -27,7 +27,7 @@

- English | 日本語 | 简体中文 | 한국어 | Español + English | 日本語 | 简体中文 | 한국어 | Español | Português | Français | Deutsch

## 🚀 概覽 From 20d8e9ab35a75a780d316dc48cd8fc1e76e8c3ed Mon Sep 17 00:00:00 2001 From: zfoong Date: Tue, 21 Apr 2026 01:35:03 +0900 Subject: [PATCH 14/81] Temporary disable ci script --- .github/workflows/{ci.yml => ci.yml.disabled} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{ci.yml => ci.yml.disabled} (100%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml.disabled similarity index 100% rename from .github/workflows/ci.yml rename to .github/workflows/ci.yml.disabled From 8b597673dccd646c148a951018239d3eb2cab8ef Mon Sep 17 00:00:00 2001 From: Korivi Date: Tue, 21 Apr 2026 10:12:45 +0900 Subject: [PATCH 15/81] Add drag & drop and copy-paste file support directly into the chat text area - User can attach files - Drag and drop support - Copy and paste support - User can click on the attached file to preview it - User can also remove the file when not needed --- .../src/pages/Chat/ChatPage.module.css | 185 +++++++++++++++++ .../frontend/src/pages/Chat/ChatPage.tsx | 192 +++++++++++++++--- 2 files changed, 352 insertions(+), 25 deletions(-) diff --git a/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.module.css b/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.module.css index ef210492..ccbe9c8a 100644 --- a/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.module.css +++ b/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.module.css @@ -648,6 +648,13 @@ flex-direction: column; gap: var(--space-2); min-width: 0; + border-radius: var(--radius-md); + transition: outline var(--transition-fast), background var(--transition-fast); +} + +.inputWrapperDragOver { + outline: 2px dashed var(--color-primary); + background: var(--color-primary-subtle); } /* Pending attachments container */ @@ -670,6 +677,14 @@ color: var(--text-primary); } +.pendingImageThumb { + width: 20px; + height: 20px; + object-fit: cover; + border-radius: 2px; + flex-shrink: 0; +} + .pendingFileName { max-width: 120px; overflow: hidden; @@ 
-681,6 +696,23 @@ color: var(--text-muted); } +.pendingAttachmentBody { + display: flex; + align-items: center; + gap: 4px; + background: none; + border: none; + padding: 0; + cursor: pointer; + color: inherit; + font-size: inherit; + min-width: 0; +} + +.pendingAttachmentBody:hover .pendingFileName { + text-decoration: underline; +} + .removeAttachment { display: flex; align-items: center; @@ -698,6 +730,159 @@ color: var(--color-error); } +/* Attachment preview modal */ +.previewOverlay { + position: fixed; + inset: 0; + background: rgba(0, 0, 0, 0.55); + backdrop-filter: blur(8px); + display: flex; + align-items: center; + justify-content: center; + z-index: 9999; + padding: 32px; + animation: previewFadeIn 0.12s ease-out; +} + +@keyframes previewFadeIn { + from { opacity: 0; } + to { opacity: 1; } +} + +.previewModal { + background: var(--bg-secondary); + border: 1px solid var(--border-secondary); + border-radius: var(--radius-xl); + /* Size fits content — each body type controls its own dimensions */ + width: fit-content; + min-width: 320px; + max-width: min(92vw, 1100px); + max-height: 92vh; + display: flex; + flex-direction: column; + overflow: hidden; + box-shadow: 0 24px 60px rgba(0, 0, 0, 0.5); + animation: previewSlideUp 0.12s ease-out; +} + +@keyframes previewSlideUp { + from { opacity: 0; transform: translateY(8px); } + to { opacity: 1; transform: translateY(0); } +} + +.previewHeader { + display: flex; + align-items: flex-start; + justify-content: space-between; + gap: 12px; + padding: 16px 20px; + border-bottom: 1px solid var(--border-primary); + min-width: 0; +} + +.previewHeaderLeft { + display: flex; + flex-direction: column; + gap: 4px; + min-width: 0; + flex: 1; +} + +.previewFileName { + font-size: var(--text-lg); + font-weight: var(--font-semibold); + color: var(--text-primary); + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.previewMeta { + font-size: var(--text-xs); + color: var(--text-secondary); +} + 
+.previewFileSize { + font-size: var(--text-xs); + color: var(--text-secondary); +} + +.previewClose { + display: flex; + align-items: center; + justify-content: center; + width: 32px; + height: 32px; + background: none; + border: none; + cursor: pointer; + color: var(--text-muted); + border-radius: var(--radius-md); + flex-shrink: 0; + transition: background var(--transition-fast), color var(--transition-fast); +} + +.previewClose:hover { + background: var(--bg-hover); + color: var(--text-primary); +} + +/* Image — shrinks/grows to natural image dimensions */ +.previewImage { + display: block; + max-width: min(88vw, 1060px); + max-height: calc(92vh - 80px); + width: auto; + height: auto; + object-fit: contain; +} + +/* PDF — fixed comfortable reading width/height */ +.previewPdf { + width: min(860px, 88vw); + height: calc(92vh - 80px); + border: none; + background: var(--bg-primary); + display: block; +} + +/* Text — readable width, scrollable height */ +.previewTextContent { + width: min(760px, 88vw); + max-height: calc(92vh - 80px); + overflow: auto; + margin: 0; + padding: 16px 20px; + font-family: var(--font-mono); + font-size: var(--text-xs); + line-height: 1.6; + color: var(--text-primary); + background: var(--bg-primary); + white-space: pre; + min-height: 120px; + box-sizing: border-box; +} + +/* Unavailable — compact */ +.previewFileInfo { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: 12px; + padding: 36px 48px; + background: var(--bg-primary); + width: min(480px, 88vw); +} + +.previewUnavailableText { + font-size: var(--text-sm); + color: var(--text-secondary); + text-align: center; + line-height: var(--leading-relaxed); + margin: 0; +} + /* Attachment error message */ .attachmentError { display: flex; diff --git a/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.tsx b/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.tsx index 3bcb6169..a5726a2e 100644 --- 
a/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.tsx +++ b/app/ui_layer/browser/frontend/src/pages/Chat/ChatPage.tsx @@ -1,4 +1,5 @@ import React, { useState, useRef, useEffect, useLayoutEffect, KeyboardEvent, useCallback, ChangeEvent, useMemo } from 'react' +import ReactDOM from 'react-dom' import { Send, Paperclip, X, Loader2, File, AlertCircle, Reply, Mic, MicOff } from 'lucide-react' import { useVirtualizer } from '@tanstack/react-virtual' import { useLocation, useNavigate } from 'react-router-dom' @@ -73,6 +74,8 @@ export function ChatPage() { const [input, setInput] = useState('') const [pendingAttachments, setPendingAttachments] = useState([]) const [attachmentError, setAttachmentError] = useState(null) + const [isDragOver, setIsDragOver] = useState(false) + const [previewAttachment, setPreviewAttachment] = useState(null) const inputRef = useRef(null) // Input history (terminal-style up/down arrow navigation) @@ -157,6 +160,15 @@ export function ChatPage() { return () => document.removeEventListener('mousedown', handler) }, [langOpen]) + // Close preview on Escape + useEffect(() => { + if (!previewAttachment) return + const handler = (e: globalThis.KeyboardEvent) => { if (e.key === 'Escape') setPreviewAttachment(null) } + document.addEventListener('keydown', handler) + return () => document.removeEventListener('keydown', handler) + }, [previewAttachment]) + + // Check if user is scrolled near the bottom const isNearBottom = useCallback(() => { const container = parentRef.current @@ -432,40 +444,31 @@ export function ChatPage() { fileInputRef.current?.click() } - const handleFileSelect = async (e: ChangeEvent) => { - const files = e.target.files - if (!files || files.length === 0) return + // Shared file processing used by file picker, paste, and drag-and-drop + const processFiles = async (files: File[]) => { + if (files.length === 0) return - // Check if adding these files would exceed the count limit const totalFileCount = pendingAttachments.length + 
files.length if (totalFileCount > MAX_ATTACHMENT_COUNT) { setAttachmentError(`Maximum ${MAX_ATTACHMENT_COUNT} files allowed. You have ${pendingAttachments.length} file(s) and are trying to add ${files.length} more.`) - e.target.value = '' return } const newAttachments: PendingAttachment[] = [] let newTotalSize = pendingAttachments.reduce((sum, att) => sum + att.size, 0) - for (let i = 0; i < files.length; i++) { - const file = files[i] - - // Check individual file size (for very large files, recommend manual copy) + for (const file of files) { if (file.size > MAX_TOTAL_SIZE_BYTES) { setAttachmentError(`File "${file.name}" (${formatFileSize(file.size)}) exceeds the 70MB limit. For very large files, please copy them directly to the agent workspace folder.`) - e.target.value = '' return } - // Check if adding this file would exceed total size limit if (newTotalSize + file.size > MAX_TOTAL_SIZE_BYTES) { setAttachmentError(`Adding "${file.name}" would exceed the 70MB total size limit. Current total: ${formatFileSize(newTotalSize)}. For large files, please copy them directly to the agent workspace folder.`) - e.target.value = '' return } try { - // Read file as base64 const content = await readFileAsBase64(file) newAttachments.push({ name: file.name, @@ -477,25 +480,56 @@ export function ChatPage() { } catch (error) { console.error('Failed to read file:', error) setAttachmentError(`Failed to read file "${file.name}". 
The file may be too large or inaccessible.`) - e.target.value = '' return } } - // Clear any previous error and add the attachments setAttachmentError(null) setPendingAttachments(prev => [...prev, ...newAttachments]) + } - // Reset file input so the same file can be selected again + const handleFileSelect = async (e: ChangeEvent) => { + const files = e.target.files + if (!files || files.length === 0) return + await processFiles(Array.from(files)) e.target.value = '' } + const handleDragOver = (e: React.DragEvent) => { + e.preventDefault() + setIsDragOver(true) + } + + const handleDragLeave = (e: React.DragEvent) => { + if (!e.currentTarget.contains(e.relatedTarget as Node)) { + setIsDragOver(false) + } + } + + const handleDrop = async (e: React.DragEvent) => { + e.preventDefault() + setIsDragOver(false) + const files = Array.from(e.dataTransfer.files) + await processFiles(files) + } + + const handlePaste = async (e: React.ClipboardEvent) => { + const files = Array.from(e.clipboardData.files) + if (files.length === 0) return + e.preventDefault() + await processFiles(files) + } + + const removeAttachment = (index: number) => { setPendingAttachments(prev => prev.filter((_, i) => i !== index)) - // Clear any error when removing files setAttachmentError(null) } + const openPreview = (att: PendingAttachment) => { + setPreviewAttachment(att) + } + // Helper to read file as base64 const readFileAsBase64 = (file: globalThis.File): Promise => { return new Promise((resolve, reject) => { @@ -511,6 +545,23 @@ export function ChatPage() { }) } + // Stable blob URL for PDF preview — only rebuilt when the attachment changes + const pdfBlobUrl = useMemo(() => { + if (!previewAttachment) return null + const isPdf = previewAttachment.type === 'application/pdf' || previewAttachment.name.toLowerCase().endsWith('.pdf') + if (!isPdf) return null + try { + const bytes = Uint8Array.from(atob(previewAttachment.content), c => c.charCodeAt(0)) + const blob = new Blob([bytes], { type: 
'application/pdf' }) + return URL.createObjectURL(blob) + } catch { return null } + }, [previewAttachment]) + + // Revoke PDF blob URL when attachment changes or modal closes + useEffect(() => { + return () => { if (pdfBlobUrl) URL.revokeObjectURL(pdfBlobUrl) } + }, [pdfBlobUrl]) + // Group actions by task const tasks = actions.filter(a => a.itemType === 'task') const [selectedTaskId, setSelectedTaskId] = useState(null) @@ -520,6 +571,7 @@ export function ChatPage() { return (
+ {/* Chat Panel - flexible width */}
@@ -635,7 +687,12 @@ export function ChatPage() { )}
-
+
{/* Attachment error message */} {(attachmentError || !attachmentValidation.valid) && (
@@ -673,13 +730,27 @@ export function ChatPage() {
{pendingAttachments.map((att, idx) => (
- - - {att.name} - - - ({formatFileSize(att.size)}) - +
+ {/* Attachment preview modal — portal so it's always on top */} + {previewAttachment && ReactDOM.createPortal( + (() => { + const isImage = previewAttachment.type.startsWith('image/') + const isPdf = previewAttachment.type === 'application/pdf' || previewAttachment.name.toLowerCase().endsWith('.pdf') + const isText = !isPdf && (previewAttachment.type.startsWith('text/') || + ['application/json', 'application/xml', 'application/javascript', + 'application/typescript', 'application/yaml', 'application/toml', + 'application/csv', 'application/x-sh'].includes(previewAttachment.type) || + /\.(txt|md|csv|json|xml|yaml|yml|toml|sh|py|js|ts|jsx|tsx|css|html|htm|env|log|ini|cfg|conf)$/i.test(previewAttachment.name)) + + let textContent = '' + let lineCount = 0 + if (isText) { + try { + const bytes = Uint8Array.from(atob(previewAttachment.content), c => c.charCodeAt(0)) + textContent = new TextDecoder('utf-8').decode(bytes) + lineCount = textContent.split('\n').length + } catch { textContent = '' } + } + + return ( +
setPreviewAttachment(null)}> +
e.stopPropagation()}> + {/* Header */} +
+
+ + {previewAttachment.name} + + + {formatFileSize(previewAttachment.size)} + {isText && lineCount > 0 && <> · {lineCount} line{lineCount !== 1 ? 's' : ''}} + {isText && <> · Formatting may be inconsistent from source} + +
+ +
+ + {/* Body */} + {isImage ? ( + {previewAttachment.name} + ) : isPdf && pdfBlobUrl ? ( +