diff --git a/BENCHMARK_RESULTS.md b/BENCHMARK_RESULTS.md new file mode 100644 index 0000000..957aed6 --- /dev/null +++ b/BENCHMARK_RESULTS.md @@ -0,0 +1,188 @@ +# MCP Universe Benchmark Results + +Comparison of Claude Code agent performance across different MCP transport configurations. + +## Test Configuration + +- **Test Suite**: Repository Management (10 GitHub tasks, 48 evaluations) +- **Agent**: claude-code-agent +- **Model**: claude-opus-4-5-20251101 +- **Max Iterations**: 20 + +--- + +## Results Summary + +| Run | Transport | Passed | Failed | Score | Total Time | Notes | +|-----|-----------|--------|--------|-------|------------|-------| +| **Run 1** | Direct API (GitHub MCP via Docker) | 15 | 33 | **31.25%** | ~50min | Baseline | +| **Run 2** | ContextBridge (mcp-remote stdio) | 15 | 33 | **31.25%** | ~50min | Same as baseline | +| **Run 3** | ContextBridge (HTTP transport) | 1 | 47 | **2.08%** | ~50min | Significant regression | + +--- + +## Run 1: Direct API (Baseline) + +**Date**: 2025-12-20 07:30 +**Report**: `log/report_20251220_073022_060a7ee3-59a8-4c92-970b-5df96e9e5c81.md` +**Transport**: GitHub MCP server via Docker (stdio) + +| Task | Passed | Failed | Score | Time | +|------|--------|--------|-------|------| +| github_task_0001 | 3 | 4 | 0.43 | 111s | +| github_task_0002 | 2 | 5 | 0.29 | - | +| github_task_0003 | 2 | 8 | 0.20 | - | +| github_task_0004 | 3 | 4 | 0.43 | - | +| github_task_0005 | 3 | 4 | 0.43 | - | +| github_task_0006 | 0 | 2 | 0.00 | - | +| github_task_0007 | 1 | 1 | 0.50 | - | +| github_task_0008 | 0 | 2 | 0.00 | - | +| github_task_0009 | 0 | 2 | 0.00 | - | +| github_task_0010 | 1 | 1 | 0.50 | - | + +**Total**: 15/48 passed (31.25%) + +--- + +## Run 2: ContextBridge (mcp-remote stdio) + +**Date**: 2025-12-20 07:58 +**Report**: `log/report_20251220_075853_f9e8c86a-8599-4c75-b3ce-5ffc73b6db91.md` +**Transport**: ContextBridge via mcp-remote (stdio proxy) + +| Task | Passed | Failed | Score | Time | 
+|------|--------|--------|-------|------| +| github_task_0001 | 2 | 5 | 0.29 | 82s | +| github_task_0002 | 3 | 4 | 0.43 | - | +| github_task_0003 | 2 | 8 | 0.20 | - | +| github_task_0004 | 3 | 4 | 0.43 | - | +| github_task_0005 | 3 | 4 | 0.43 | - | +| github_task_0006 | 0 | 2 | 0.00 | - | +| github_task_0007 | 0 | 2 | 0.00 | - | +| github_task_0008 | 1 | 1 | 0.50 | - | +| github_task_0009 | 0 | 2 | 0.00 | - | +| github_task_0010 | 1 | 1 | 0.50 | - | + +**Total**: 15/48 passed (31.25%) + +--- + +## Run 3: ContextBridge (HTTP transport) + +**Date**: 2025-12-20 09:58 +**Report**: `log/report_20251220_095837_8ec88e24-a7d9-4a30-9e8f-e8d74c4783f3.md` +**Transport**: ContextBridge via Claude Code SDK HTTP transport + +| Task | Passed | Failed | Score | Time | +|------|--------|--------|-------|------| +| github_task_0001 | 0 | 7 | 0.00 | 100s | +| github_task_0002 | 0 | 7 | 0.00 | 12s | +| github_task_0003 | 0 | 10 | 0.00 | 225s | +| github_task_0004 | 0 | 7 | 0.00 | 153s | +| github_task_0005 | 0 | 7 | 0.00 | 565s | +| github_task_0006 | 0 | 2 | 0.00 | 405s | +| github_task_0007 | 1 | 1 | 0.50 | 395s | +| github_task_0008 | 0 | 2 | 0.00 | 52s | +| github_task_0009 | 0 | 2 | 0.00 | 305s | +| github_task_0010 | 0 | 2 | 0.00 | 567s | + +**Total**: 1/48 passed (2.08%) + +### Run 3 Failure Analysis + +Primary failure reasons: +- **"the repository doesn't exist"** - Most common failure, indicates the agent couldn't create repos via ContextBridge +- **"the branches don't exist"** - Secondary failure +- **"the file content is not found"** - Tertiary failure +- **"the PR doesn't exist"** - Downstream failure + +**Root Cause**: The Claude Code SDK HTTP transport to ContextBridge appears to have connectivity or authentication issues. The agent received the prompts but couldn't execute GitHub operations through the gateway. 
+ +--- + +## Analysis + +### Performance Comparison + +| Metric | Run 1 (Direct) | Run 2 (mcp-remote) | Run 3 (HTTP) | +|--------|----------------|--------------------| --------------| +| Success Rate | 31.25% | 31.25% | 2.08% | +| Total Passed | 15 | 15 | 1 | +| Total Failed | 33 | 33 | 47 | +| Task 1 Latency | 111s | 82s | 100s | + +### Key Findings + +1. **Run 1 vs Run 2**: Equivalent performance + - Both achieved 31.25% success rate + - mcp-remote stdio transport works correctly with ContextBridge + - Task-level variance exists but balances out + +2. **Run 3: HTTP transport failure** + - Dramatic regression: 2.08% vs 31.25% + - Only github_task_0007 partially succeeded (1/2 evals) + - All other tasks failed to create repositories + - Suggests HTTP transport configuration or ContextBridge authentication issue + +### Potential Run 3 Issues + +1. **HTTP transport not fully supported** by Claude Code SDK for MCP +2. **Missing authentication headers** in HTTP config +3. **ContextBridge gateway** may require different authentication for HTTP vs SSE +4. **Tool discovery failure** - agent may not have received tool list from gateway + +--- + +## Known Issues + +1. **Evaluator Bug**: `IndexError` in `github__get_file_contents` (line 61 in functions.py) + - `output.content[1].resource.text` fails when content list is empty + - Affects all runs equally + +2. **LLM Call Tracking**: Reports show 0 LLM calls for claude-code-agent + - Tracking issue only, doesn't affect actual execution + +--- + +## Recommendations + +1. **Investigate HTTP transport failure** + - Check ContextBridge logs for Run 3 + - Verify HTTP authentication is working + - Consider using mcp-remote as the stable option + +2. **Fix evaluator bug** + - Add bounds checking in `github__get_file_contents` + - Would likely improve reported success rates + +3. 
**For production use** + - Use mcp-remote stdio transport until HTTP is debugged + - Both Run 1 and Run 2 show equivalent 31.25% success rate + +--- + +## Quick Mode Comparison (Run 4 vs Run 5) + +**Date**: 2025-12-21 + +| Transport | Task 0001 | Task 0007 | Task 0010 | Total | Score | Time | +|-----------|-----------|-----------|-----------|-------|-------|------| +| **Run 4: Direct GitHub MCP** | 0/7 | 1/2 | 0/2 | **1/11** | **9.09%** | ~2min | +| **Run 5: ContextBridge HTTP** | 0/7 | 0/2 | 0/2 | **0/11** | **0.00%** | ~2min | + +### Key Finding + +ContextBridge via HTTP transport performed worse than direct GitHub MCP: +- The agent trace shows `search_tools` as first action instead of actual GitHub operations +- Suggests tools aren't properly exposed via HTTP transport +- Authentication works (Bearer token accepted) but tool discovery/execution may be incomplete + +### ContextBridge Connection Issues Encountered + +1. **mcp-remote SSE errors** - Required Node.js 20.18.1+ (upgraded to 22) +2. **OAuth localhost callback** - ContextBridge only supports hosted callback, not localhost +3. **HTTP transport fallback** - Used direct HTTP with Bearer token from cached auth + +--- + +*Last Updated: 2025-12-21* diff --git a/claude.md b/claude.md new file mode 100644 index 0000000..742408e --- /dev/null +++ b/claude.md @@ -0,0 +1,133 @@ +# MCP Universe - Fork for MCP Gateway Testing + +## Project Overview + +This is a fork of the original MCP Universe repository, specifically created to test and evaluate an MCP gateway implementation. + +## Project Goals + +### 1. Initial Testing Phase +- **Objective**: Run repository management tests using direct Anthropic API access +- **Approach**: Use personal Anthropic API key to establish baseline performance +- **Test Suite**: Repository management benchmark (34 GitHub-related tasks) +- **Models**: Testing with Claude 4.5 models (Sonnet, Opus, Haiku) + +### 2. 
MCP Gateway Integration Phase +- **Objective**: Test the same benchmarks through an MCP gateway +- **Approach**: Configure the gateway URL and route requests through it +- **Purpose**: Validate gateway functionality and performance + +### 3. Comparison & Analysis Phase +- **Objective**: Compare direct API vs. gateway performance +- **Metrics to Compare**: + - Test pass/fail rates + - Response times + - Token usage + - Cost efficiency + - Error rates + - Overall reliability + +## Current Status + +**Phase**: Initial Setup +**Next Step**: Run repository management tests with direct Anthropic API + +## Implementation Plan + +### Step 1: Direct Anthropic API Testing (Current) +Detailed implementation plan saved at: `REPO_MANAGEMENT_TEST_PLAN.md` + +**Summary**: +1. Configure `.env` with `ANTHROPIC_API_KEY` and GitHub credentials +2. Update `mcpuniverse/benchmark/configs/test/repository_management.yaml`: + - Change `type: openai` → `type: claude` + - Set `model_name` to Claude 4.5 variant +3. Run benchmark: `pytest tests/benchmark/test_benchmark_repository_management.py` +4. Collect baseline metrics and results + +### Step 2: MCP Gateway Testing (Planned) +1. Configure MCP gateway URL in environment +2. Update configuration to route through gateway +3. Run the same benchmark suite +4. Collect gateway performance metrics + +### Step 3: Comparison Analysis (Planned) +1. Compare direct API vs. gateway results +2. Document performance differences +3. Identify optimization opportunities +4. 
Generate comprehensive comparison report + +## Repository Structure + +Key files and directories: +- `REPO_MANAGEMENT_TEST_PLAN.md` - Detailed test execution plan +- `mcpuniverse/benchmark/configs/test/` - Benchmark configurations +- `tests/benchmark/` - Benchmark test suites +- `log/` - Test execution logs and reports +- `.env` - Environment configuration (not committed) + +## Reference Documentation + +### Previous Work +- **OpenRouter Migration Plan**: `/Users/hev/.claude/plans/soft-swimming-snowflake.md` + - Documents previous effort to consolidate LLM providers + - Not currently active for this fork + +### Claude 4.5 Models +| Model | API Name | Use Case | +|-------|----------|----------| +| Sonnet 4.5 | `claude-sonnet-4-5-20250929` | Balanced performance/cost | +| Opus 4.5 | `claude-opus-4-5-20251101` | Maximum capability | +| Haiku 4.5 | `claude-haiku-4-5` | Speed/cost optimization | + +## Testing Methodology + +### Baseline Testing (Direct API) +- **Provider**: Anthropic (direct API) +- **Authentication**: `ANTHROPIC_API_KEY` +- **Configuration**: `type: claude` in YAML config +- **Benchmark**: Repository management (34 tasks) + +### Gateway Testing (Upcoming) +- **Provider**: MCP Gateway +- **Authentication**: Gateway-specific credentials +- **Configuration**: Gateway URL + model routing +- **Benchmark**: Same 34 repository management tasks + +### Comparison Metrics +1. **Functional Metrics** + - Task success rate + - Correctness of outputs + - Error handling + +2. **Performance Metrics** + - Request latency + - Total execution time + - Throughput + +3. **Cost Metrics** + - Token usage + - API costs + - Resource utilization + +4. **Reliability Metrics** + - Error rates + - Retry counts + - Failure patterns + +## Next Steps + +1. ✅ Create implementation plan (REPO_MANAGEMENT_TEST_PLAN.md) +2. ⏳ Set up environment (.env file) +3. ⏳ Run baseline tests with direct Anthropic API +4. ⏳ Document baseline results +5. ⏳ Configure MCP gateway +6. 
⏳ Run gateway tests +7. ⏳ Generate comparison analysis +8. ⏳ Document findings and recommendations + +--- + +**Last Updated**: 2025-12-07 +**Primary Contact**: [Your contact info] +**Original Repository**: [Link to upstream MCP-Universe] diff --git a/mcpuniverse/agent/claude_code.py b/mcpuniverse/agent/claude_code.py index ffdae3f..f183d33 100644 --- a/mcpuniverse/agent/claude_code.py +++ b/mcpuniverse/agent/claude_code.py @@ -5,8 +5,9 @@ a simple interface for interacting with claude code SDK. """ # pylint: disable=unused-argument,broad-exception-caught -from typing import Optional, Union, Dict, List +from typing import Optional, Union, Dict, List, Any from dataclasses import dataclass, asdict, is_dataclass, field +from collections import OrderedDict from claude_code_sdk import ClaudeSDKClient, ClaudeCodeOptions from claude_code_sdk.types import ToolUseBlock, ToolResultBlock, ResultMessage from pydantic import BaseModel @@ -29,9 +30,20 @@ class ClaudeCodeConfig(BaseAgentConfig): """ Configuration class for claude-code agents. + + Attributes: + max_iterations: Maximum number of conversation turns. + model: Claude model to use (required - no default to prevent accidentally + using wrong model). Examples: "claude-opus-4-5-20251101", + "claude-sonnet-4-5-20250929", "claude-haiku-4-5". + + Note: + Agent configs use 'model' while LLM configs use 'model_name'. This is + intentional - agents wrap SDK clients while LLM configs are for direct + API access. The naming follows OpenAI Agent SDK conventions. """ max_iterations: int = 10 - model: str = "claude-opus-4-5-20251101" + model: str = "" # Required - must be explicitly set in config disallowed_tools: list = field(default_factory=lambda: [ "Bash", "Edit", "MultiEdit", "Write", "NotebookEdit", "TodoWrite", "KillBash"]) @@ -57,16 +69,107 @@ def __init__( **kwargs ): """ - Initialize a BasicAgent instance. + Initialize a ClaudeCodeAgent instance. Args: - llm (BaseLLM): The LLM to be used by this agent. 
+ mcp_manager (MCPManager): The MCP manager for server connections. config (Optional[Union[Dict, str]]): Agent configuration, either as a dictionary or a string reference to a predefined configuration. **kwargs: Additional keyword arguments for agent initialization. """ super().__init__(mcp_manager=mcp_manager, llm=None, config=config) + if not self._config.model: + raise ValueError( + "ClaudeCodeConfig.model is required. Please specify a model in your config, " + "e.g., 'claude-opus-4-5-20251101', 'claude-sonnet-4-5-20250929', or 'claude-haiku-4-5'" + ) self._logger = get_logger(f"{self.__class__.__name__}:{self._name}") + # HTTP server configs (passed directly to Claude Code SDK, no MCP client needed) + self._http_server_configs: Dict[str, Dict[str, Any]] = {} + + async def initialize(self, mcp_servers: Optional[List[dict]] = None): + """ + Initialize MCP clients and HTTP server configurations. + + For HTTP transport servers (like ContextBridge), configs are stored directly + and passed to Claude Code SDK without building MCP clients. The SDK handles + the HTTP connection and authentication internally. + + Args: + mcp_servers (List[dict], optional): A list of MCP servers. 
+ """ + if self._initialized: + return + + if mcp_servers is None: + mcp_servers = self._config.servers + + self._mcp_clients = OrderedDict() + self._http_server_configs = {} + self._tools = {} + + for server in mcp_servers: + server_name = server["name"] + transport = server.get("transport", "stdio") + + # Check if this server has HTTP config in the MCP manager + if self._mcp_manager: + server_config = self._mcp_manager.get_config(server_name) + if server_config.http.url: + # HTTP transport: store config directly, Claude Code SDK will handle it + self._http_server_configs[server_name] = { + "type": "http", + "url": server_config.http.url + } + if server_config.http.headers: + self._http_server_configs[server_name]["headers"] = server_config.http.headers + self._logger.info("Configured HTTP MCP server: %s -> %s", + server_name, server_config.http.url) + continue + + # For stdio/sse servers, build MCP client as usual + client = await self._mcp_manager.build_client( + server_name, + transport=transport, + permissions=server.get("permissions", None) + ) + client.project_id = self._project_id + self._mcp_clients[server_name] = client + + # Get tools for stdio/sse servers + tools = await self._mcp_clients[server_name].list_tools() + selected_tools = server.get("tools", None) + if selected_tools is None: + self._tools[server_name] = tools + else: + self._tools[server_name] = [tool for tool in tools if tool.name in selected_tools] + + await self._initialize() + self._initialized = True + + def get_mcp_configs(self) -> Dict[str, Dict[str, Any]]: + """ + Retrieve MCP configurations for Claude Code SDK. + + Returns configurations from both: + - Connected MCP clients (stdio/sse) + - HTTP servers (stored directly, no client needed) + + Returns: + Dict[str, Dict[str, Any]]: Server name to MCP config mapping. 
+ """ + configs = {} + + # Add configs from connected clients + for name, client in self._mcp_clients.items(): + config = client.get_mcp_config() + if config: + configs[name] = config + + # Add HTTP server configs (these don't need MCP client connection) + configs.update(self._http_server_configs) + + return configs async def _execute( self, diff --git a/mcpuniverse/benchmark/configs/test/repository_management/github_task_0001.json b/mcpuniverse/benchmark/configs/test/repository_management/github_task_0001.json index cdb3d6c..93b0468 100644 --- a/mcpuniverse/benchmark/configs/test/repository_management/github_task_0001.json +++ b/mcpuniverse/benchmark/configs/test/repository_management/github_task_0001.json @@ -4,7 +4,7 @@ "use_specified_server": true, "mcp_servers": [ { - "name": "github" + "name": "contextbridge" } ], "evaluators": [ @@ -89,7 +89,7 @@ ], "cleanups": [ { - "server": "github", + "server": "contextbridge", "tool": "create_repository", "cleanup_func": "delete_repository", "cleanup_args": { diff --git a/mcpuniverse/benchmark/configs/test/repository_management/github_task_0007.json b/mcpuniverse/benchmark/configs/test/repository_management/github_task_0007.json index a3ffdb7..113e50d 100644 --- a/mcpuniverse/benchmark/configs/test/repository_management/github_task_0007.json +++ b/mcpuniverse/benchmark/configs/test/repository_management/github_task_0007.json @@ -4,7 +4,7 @@ "use_specified_server": true, "mcp_servers": [ { - "name": "github" + "name": "contextbridge" } ], "evaluators": [ @@ -32,7 +32,7 @@ ], "cleanups": [ { - "server": "github", + "server": "contextbridge", "tool": "create_repository", "cleanup_func": "delete_repository", "cleanup_args": { diff --git a/mcpuniverse/benchmark/configs/test/repository_management/github_task_0010.json b/mcpuniverse/benchmark/configs/test/repository_management/github_task_0010.json index c077545..e77e3d9 100644 --- a/mcpuniverse/benchmark/configs/test/repository_management/github_task_0010.json +++ 
b/mcpuniverse/benchmark/configs/test/repository_management/github_task_0010.json @@ -4,7 +4,7 @@ "use_specified_server": true, "mcp_servers": [ { - "name": "github" + "name": "contextbridge" } ], "evaluators": [ @@ -31,7 +31,7 @@ ], "cleanups": [ { - "server": "github", + "server": "contextbridge", "tool": "create_repository", "cleanup_func": "delete_repository", "cleanup_args": { diff --git a/mcpuniverse/benchmark/configs/test/repository_management_claude_code.yaml b/mcpuniverse/benchmark/configs/test/repository_management_claude_code.yaml index 1bdeeff..a854cfd 100644 --- a/mcpuniverse/benchmark/configs/test/repository_management_claude_code.yaml +++ b/mcpuniverse/benchmark/configs/test/repository_management_claude_code.yaml @@ -1,3 +1,11 @@ +kind: llm +spec: + name: llm-1 + type: claude + config: + model_name: claude-opus-4-5-20251101 + +--- kind: agent spec: name: claude-code-agent @@ -5,14 +13,14 @@ spec: config: instruction: You are an agent for repository management. model: claude-opus-4-5-20251101 - max_iterations: 20 + max_iterations: 10 servers: - name: github --- kind: benchmark spec: - description: Test Claude Code agent for github tasks with Opus 4.5. 
+ description: Full benchmark run - 10 tasks, 10 iterations agent: claude-code-agent tasks: - test/repository_management/github_task_0001.json @@ -25,26 +33,3 @@ spec: - test/repository_management/github_task_0008.json - test/repository_management/github_task_0009.json - test/repository_management/github_task_0010.json - - test/repository_management/github_task_0011.json - - test/repository_management/github_task_0012.json - - test/repository_management/github_task_0014.json - - test/repository_management/github_task_0015.json - - test/repository_management/github_task_0016.json - - test/repository_management/github_task_0017.json - - test/repository_management/github_task_0018.json - - test/repository_management/github_task_0019.json - - test/repository_management/github_task_0021.json - - test/repository_management/github_task_0022.json - - test/repository_management/github_task_0023.json - - test/repository_management/github_task_0024.json - - test/repository_management/github_task_0025.json - - test/repository_management/github_task_0026.json - - test/repository_management/github_task_0027.json - - test/repository_management/github_task_0028.json - - test/repository_management/github_task_0029.json - - test/repository_management/github_task_0030.json - - test/multi_server/multi-server_task_playwright_github_0001.json - - test/multi_server/multi-server_task_playwright_github_0002.json - - test/multi_server/multi-server_task_playwright_github_0003.json - - test/multi_server/multi-server_task_playwright_github_0004.json - - test/multi_server/multi-server_task_playwright_github_0005.json diff --git a/mcpuniverse/benchmark/configs/test/repository_management_claude_code_quick.yaml b/mcpuniverse/benchmark/configs/test/repository_management_claude_code_quick.yaml new file mode 100644 index 0000000..bbbb581 --- /dev/null +++ b/mcpuniverse/benchmark/configs/test/repository_management_claude_code_quick.yaml @@ -0,0 +1,28 @@ +kind: llm +spec: + name: llm-1 + type: claude + 
config: + model_name: claude-opus-4-5-20251101 + +--- +kind: agent +spec: + name: claude-code-agent + type: claude-code + config: + instruction: You are an agent for repository management. + model: claude-opus-4-5-20251101 + max_iterations: 5 + servers: + - name: contextbridge + +--- +kind: benchmark +spec: + description: Quick validation run - 3 tasks, 5 iterations + agent: claude-code-agent + tasks: + - test/repository_management/github_task_0001.json + - test/repository_management/github_task_0007.json + - test/repository_management/github_task_0010.json diff --git a/mcpuniverse/mcp/config.py b/mcpuniverse/mcp/config.py index f00b52a..b2ec08b 100644 --- a/mcpuniverse/mcp/config.py +++ b/mcpuniverse/mcp/config.py @@ -60,21 +60,39 @@ def list_unspecified_params(self) -> List[str]: return [arg for arg in self.args if re.findall(r"\{\{.*?\}\}", "".join(arg.strip().split()))] +@dataclass +class HttpConfig(BaseConfig): + """ + Configuration class for HTTP transport. + + This class represents the configuration for an HTTP-based MCP server, + typically used for remote MCP gateways like ContextBridge. + + Attributes: + url (str): The URL of the HTTP MCP server. + headers (Dict): Optional HTTP headers. + """ + url: str = "" + headers: Dict = field(default_factory=dict) + + @dataclass class ServerConfig(BaseConfig): """ Configuration class for an MCP server. This class represents the complete configuration for an MCP server, - including standard I/O, SSE, and environment configurations. + including standard I/O, SSE, HTTP, and environment configurations. Attributes: stdio (CommandConfig): Configuration for standard I/O command. sse (CommandConfig): Configuration for SSE command. + http (HttpConfig): Configuration for HTTP transport. env (Dict): Dictionary of environment variables. 
""" stdio: CommandConfig = field(default_factory=CommandConfig) sse: CommandConfig = field(default_factory=CommandConfig) + http: HttpConfig = field(default_factory=HttpConfig) env: Dict = field(default_factory=dict) def render_template(self, params: Optional[Dict] = None): diff --git a/mcpuniverse/mcp/configs/server_list.json b/mcpuniverse/mcp/configs/server_list.json index 4f9ff3c..70b3300 100644 --- a/mcpuniverse/mcp/configs/server_list.json +++ b/mcpuniverse/mcp/configs/server_list.json @@ -284,5 +284,14 @@ "https://mcp.paypal.com/sse" ] } + }, + + "contextbridge": { + "http": { + "url": "https://gateway.contextbridge.ai/mcp", + "headers": { + "Authorization": "Bearer {{CONTEXTBRIDGE_API_KEY}}" + } + } } } \ No newline at end of file diff --git a/mcpuniverse/mcp/configs/server_list_example.json b/mcpuniverse/mcp/configs/server_list_example.json new file mode 100644 index 0000000..e084202 --- /dev/null +++ b/mcpuniverse/mcp/configs/server_list_example.json @@ -0,0 +1,52 @@ +{ + "_comment": "Example MCP server configurations. Copy entries to server_list.json and replace template variables.", + "_docs": { + "transport_types": { + "stdio": "Standard I/O transport - runs command locally", + "sse": "Server-Sent Events transport - runs as HTTP server on specified port", + "http": "HTTP transport - connects to remote MCP gateway" + }, + "template_variables": "Use {{VARIABLE_NAME}} syntax. Values are loaded from environment or .env file." 
+ }, + + "example_stdio_server": { + "stdio": { + "command": "python3", + "args": ["-m", "your_mcp_server"] + }, + "env": { + "API_KEY": "{{YOUR_API_KEY}}" + } + }, + + "example_sse_server": { + "sse": { + "command": "python3", + "args": [ + "-m", "your_mcp_server", + "--transport", "sse", + "--port", "{{PORT}}" + ] + } + }, + + "example_http_gateway": { + "_comment": "HTTP transport for remote MCP gateways like ContextBridge", + "http": { + "url": "https://your-gateway.example.com/mcp", + "headers": { + "Authorization": "Bearer {{YOUR_GATEWAY_API_KEY}}" + } + } + }, + + "contextbridge": { + "_comment": "ContextBridge MCP Gateway - set CONTEXTBRIDGE_API_KEY in your environment", + "http": { + "url": "https://gateway.contextbridge.ai/mcp", + "headers": { + "Authorization": "Bearer {{CONTEXTBRIDGE_API_KEY}}" + } + } + } +} diff --git a/tests/benchmark/test_benchmark_repository_management_claude_code.py b/tests/benchmark/test_benchmark_repository_management_claude_code.py index 61363d2..cf1b9ba 100644 --- a/tests/benchmark/test_benchmark_repository_management_claude_code.py +++ b/tests/benchmark/test_benchmark_repository_management_claude_code.py @@ -1,3 +1,4 @@ +import os import unittest import pytest from mcpuniverse.tracer.collectors import FileCollector @@ -6,11 +7,30 @@ from mcpuniverse.callbacks.handlers.vprint import get_vprint_callbacks +# Set BENCHMARK_MODE=quick for fast validation (3 tasks, 5 iterations) +# Set BENCHMARK_MODE=full for complete benchmark (10 tasks, 10 iterations) +# Default: quick +BENCHMARK_MODE = os.environ.get("BENCHMARK_MODE", "quick") + +CONFIG_FILES = { + "quick": "test/repository_management_claude_code_quick.yaml", + "full": "test/repository_management_claude_code.yaml", +} + + class TestBenchmarkRunnerClaudeCode(unittest.IsolatedAsyncioTestCase): async def test(self): - trace_collector = FileCollector(log_file="log/repository_management_claude_code.log") - benchmark = BenchmarkRunner("test/repository_management_claude_code.yaml") + 
config_file = CONFIG_FILES.get(BENCHMARK_MODE, CONFIG_FILES["quick"]) + log_suffix = f"_{BENCHMARK_MODE}" if BENCHMARK_MODE != "full" else "" + + print(f"\n{'='*66}") + print(f"Running benchmark in {BENCHMARK_MODE.upper()} mode") + print(f"Config: {config_file}") + print(f"{'='*66}\n") + + trace_collector = FileCollector(log_file=f"log/repository_management_claude_code{log_suffix}.log") + benchmark = BenchmarkRunner(config_file) results = await benchmark.run( trace_collector=trace_collector, callbacks=get_vprint_callbacks() diff --git a/tests/data/config/claude_code.yaml b/tests/data/config/claude_code.yaml index 12f2440..b41cae4 100644 --- a/tests/data/config/claude_code.yaml +++ b/tests/data/config/claude_code.yaml @@ -4,5 +4,6 @@ spec: type: claude-code config: instruction: You are an agent for weather forecast + model: claude-sonnet-4-5-20250929 servers: - name: weather \ No newline at end of file