From f0edaf6649630607c6256de4e1bf2b49c86ae84d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:42:01 +0000 Subject: [PATCH 1/3] Initial plan From 7e9dae785a9632e7a4efabb63ad6dc6d73ad2014 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:53:25 +0000 Subject: [PATCH 2/3] fix: add dedicated BATCH_TRIAGE_SYSTEM_PROMPT and parameterize language in prompts - Add BATCH_TRIAGE_SYSTEM_PROMPT separate from SYSTEM_PROMPT to avoid conflicting output format instructions for batch triage - Add BATCH_TRIAGE_PROMPT with {language} parameter instead of hardcoded python - Parameterize {language} in SUGGESTION_PROMPT, SAFETY_CHECK_PROMPT, and DOCUMENTATION_PROMPT - Add evaluate_issues_batch method using the dedicated system prompt - Update generate_suggestion and generate_documentation to accept language param - Add comprehensive tests for batch triage functionality Co-authored-by: omsherikar <180152315+omsherikar@users.noreply.github.com> --- refactron/llm/orchestrator.py | 117 ++++++++++++++++++++- refactron/llm/prompts.py | 42 +++++++- tests/test_llm_batch_triage.py | 180 +++++++++++++++++++++++++++++++++ 3 files changed, 330 insertions(+), 9 deletions(-) create mode 100644 tests/test_llm_batch_triage.py diff --git a/refactron/llm/orchestrator.py b/refactron/llm/orchestrator.py index f4da49d..2608384 100644 --- a/refactron/llm/orchestrator.py +++ b/refactron/llm/orchestrator.py @@ -5,13 +5,19 @@ import os import re from pathlib import Path -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union from refactron.core.models import CodeIssue, IssueCategory, IssueLevel from refactron.llm.backend_client import BackendLLMClient from refactron.llm.client import GroqClient from refactron.llm.models import RefactoringSuggestion, SuggestionStatus -from refactron.llm.prompts import DOCUMENTATION_PROMPT, SUGGESTION_PROMPT, SYSTEM_PROMPT +from refactron.llm.prompts import ( + BATCH_TRIAGE_PROMPT, + BATCH_TRIAGE_SYSTEM_PROMPT, + DOCUMENTATION_PROMPT, + SUGGESTION_PROMPT, + SYSTEM_PROMPT, +) from refactron.llm.safety import SafetyGate from refactron.rag.retriever import ContextRetriever @@ -43,12 +49,15 @@ def __init__( self.safety_gate = safety_gate or SafetyGate() - def generate_suggestion(self, issue: CodeIssue, original_code: str) -> RefactoringSuggestion: + def generate_suggestion( + self, issue: CodeIssue, original_code: str, language: str = "python" + ) -> RefactoringSuggestion: """Generate a refactoring suggestion for a code issue. Args: issue: The code issue to fix original_code: The failing code snippet + language: The programming language of the code (default: "python") Returns: A validated refactoring suggestion @@ -75,6 +84,7 @@ def generate_suggestion(self, issue: CodeIssue, original_code: str) -> Refactori severity=issue.level.value, original_code=original_code, rag_context=rag_context, + language=language, ) # 3. Call LLM @@ -156,13 +166,14 @@ def generate_suggestion(self, issue: CodeIssue, original_code: str) -> Refactori return suggestion def generate_documentation( - self, code: str, file_path: str = "unknown" + self, code: str, file_path: str = "unknown", language: str = "python" ) -> RefactoringSuggestion: """Generate documentation for the provided code. Args: code: The code to document file_path: Optional file path for context + language: The programming language of the code (default: "python") Returns: A suggestion containing the documented code @@ -189,7 +200,9 @@ def generate_documentation( rag_context = "\n\n".join(context_snippets) if context_snippets else "No context available." # 2. Construct Prompt - prompt = DOCUMENTATION_PROMPT.format(original_code=code, rag_context=rag_context) + prompt = DOCUMENTATION_PROMPT.format( + original_code=code, rag_context=rag_context, language=language + ) # 3. Call LLM try: @@ -246,6 +259,100 @@ def generate_documentation( status=SuggestionStatus.FAILED, ) + def evaluate_issues_batch( + self, + issues: List[CodeIssue], + source_code: str, + language: str = "python", + ) -> Dict[str, float]: + """Evaluate a batch of issues for a single file to suppress false positives. + + Args: + issues: List of CodeIssues found in the file + source_code: The full source code of the file + language: The programming language of the code (default: "python") + + Returns: + Dict mapping issue IDs (using rule_id or index) to confidence scores + """ + if not issues: + return {} + + # 1. Retrieve Context + context_snippets = [] + if self.retriever: + try: + results = self.retriever.retrieve_similar(source_code[:1000], top_k=3) + context_snippets = [r.content for r in results] + except Exception as e: + logger.warning(f"Context retrieval failed: {e}") + + rag_context = ( + "\n\n".join(context_snippets) if context_snippets else "No context available." + ) + + # 2. Construct JSON for issues + issues_data = {} + for i, issue in enumerate(issues): + base_id = getattr(issue, "rule_id", None) or "issue" + line_number = getattr(issue, "line_number", None) + id_parts = [str(base_id)] + if line_number is not None: + id_parts.append(str(line_number)) + id_parts.append(str(i)) + issue_id = ":".join(id_parts) + + # Ensure uniqueness in case of unexpected collisions + unique_id = issue_id + suffix = 1 + while unique_id in issues_data: + suffix += 1 + unique_id = f"{issue_id}_{suffix}" + + issues_data[unique_id] = { + "rule_id": getattr(issue, "rule_id", None), + "message": issue.message, + "line": issue.line_number, + "category": ( + issue.category.value + if hasattr(issue.category, "value") + else str(issue.category) + ), + "severity": ( + issue.level.value if hasattr(issue.level, "value") else str(issue.level) + ), + } + + # 3. Construct Prompt + prompt = BATCH_TRIAGE_PROMPT.format( + source_code=source_code, + rag_context=rag_context, + issues_json=json.dumps(issues_data, indent=2), + language=language, + ) + + # 4. Call LLM with dedicated batch triage system prompt + try: + response_text = self.client.generate( + prompt=prompt, system=BATCH_TRIAGE_SYSTEM_PROMPT, temperature=0.1 + ) + clean_text = self._clean_json_response(response_text) + data = json.loads(clean_text, strict=False) + + # Ensure we return a Dict[str, float] + result = {} + for k, v in data.items(): + try: + result[str(k)] = float(v) + except (ValueError, TypeError): + result[str(k)] = 0.5 # Fallback for parsing errors + return result + + except Exception as e: + logger.error(f"Batch triage failed: {e}") + # Fallback: return default confidence + return {str(k): 0.5 for k in issues_data.keys()} + def _clean_json_response(self, text: str) -> str: """Clean LLM response to extract JSON.""" text = text.strip() diff --git a/refactron/llm/prompts.py b/refactron/llm/prompts.py index ade8031..daf1980 100644 --- a/refactron/llm/prompts.py +++ b/refactron/llm/prompts.py @@ -27,7 +27,7 @@ Severity: {severity} Original Code: -```python +```{language} {original_code} ``` @@ -42,7 +42,7 @@ SAFETY_CHECK_PROMPT = """ Analyze the following code patch for safety risks: -```python +```{language} {proposed_code} ``` @@ -62,10 +62,10 @@ """ DOCUMENTATION_PROMPT = """ -Analyze the following Python code and generate a comprehensive MARKDOWN documentation file. +Analyze the following {language} code and generate a comprehensive MARKDOWN documentation file. Original Code: -```python +```{language} {original_code} ``` @@ -92,3 +92,37 @@ The complete Markdown documentation content including the mermaid diagram @@@END@@@ """ + +BATCH_TRIAGE_SYSTEM_PROMPT = """You are a code triage expert. +Your goal is to evaluate code issues and determine whether each is a true positive +(requiring fixing) or a false positive. + +RESPONSE FORMAT: +You must output ONLY valid JSON. +- Do not output markdown code blocks, just the raw JSON object. +- The JSON must be a flat map where keys are issue IDs (strings) and values are + confidence scores (floats between 0.0 and 1.0). +- A score of 0.0 means the issue is very likely a false positive. +- A score of 1.0 means the issue is very likely a true positive requiring a fix. +""" + +BATCH_TRIAGE_PROMPT = """ +Evaluate the following list of code issues found in a single file and determine +the confidence that each is a true positive (requiring fixing) rather than a +false positive. + +File Source Code: +```{language} +{source_code} +``` + +Relevant Context (RAG): +{rag_context} + +Issues to evaluate: +{issues_json} + +Return ONLY a JSON map where the keys are the issue IDs and the values are the +confidence scores (float between 0.0 and 1.0). +Do NOT return anything except the JSON object. +""" diff --git a/tests/test_llm_batch_triage.py b/tests/test_llm_batch_triage.py new file mode 100644 index 0000000..db0debf --- /dev/null +++ b/tests/test_llm_batch_triage.py @@ -0,0 +1,180 @@ +"""Tests for the Batched Triage & RAG Context in LLMOrchestrator.""" + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from refactron.core.models import CodeIssue, IssueCategory, IssueLevel +from refactron.llm.orchestrator import LLMOrchestrator +from refactron.llm.prompts import BATCH_TRIAGE_SYSTEM_PROMPT +from refactron.rag.retriever import ContextRetriever + + +@pytest.fixture +def mock_retriever(): + retriever = MagicMock(spec=ContextRetriever) + mock_result = MagicMock() + mock_result.content = "Some context snippet" + retriever.retrieve_similar.return_value = [mock_result] + return retriever + + +@pytest.fixture +def mock_llm_client(): + client = MagicMock() + client.model = "mock-model" + # Provide a mock JSON response for evaluate_issues_batch + client.generate.return_value = """```json +{ + "issue:10:0": 0.85, + "issue:20:1": 0.12, + "E101:30:2": 0.95 +} +```""" + return client + + +def test_evaluate_issues_batch(mock_llm_client, mock_retriever): + """Test that batch evaluation correctly parses JSON map from the LLM.""" + orchestrator = LLMOrchestrator(retriever=mock_retriever, llm_client=mock_llm_client) + + issues = [ + CodeIssue( + category=IssueCategory.COMPLEXITY, + level=IssueLevel.WARNING, + message="Too complex", + file_path=Path("test.py"), + line_number=10, + ), + CodeIssue( + category=IssueCategory.STYLE, + level=IssueLevel.INFO, + message="Line too long", + file_path=Path("test.py"), + line_number=20, + ), + CodeIssue( + category=IssueCategory.CODE_SMELL, + level=IssueLevel.WARNING, + message="Bad smell", + file_path=Path("test.py"), + line_number=30, + rule_id="E101", + ), + ] + + source_code = "def complex_function():\n pass\n" * 10 + + result = orchestrator.evaluate_issues_batch(issues, source_code) + + # Check that ContextRetriever was called for RAG Context + mock_retriever.retrieve_similar.assert_called_once() + assert "def complex_function" in mock_retriever.retrieve_similar.call_args[0][0] + + # Check JSON map parsing + assert isinstance(result, dict) + assert result.get("issue:10:0") == 0.85 + assert result.get("issue:20:1") == 0.12 + assert result.get("E101:30:2") == 0.95 + + # Ensure there's exactly 3 keys corresponding to the 3 returned mapping + assert len(result) == 3 + + +def test_evaluate_issues_batch_uses_dedicated_system_prompt(mock_llm_client, mock_retriever): + """Test that batch evaluation uses BATCH_TRIAGE_SYSTEM_PROMPT, not SYSTEM_PROMPT.""" + orchestrator = LLMOrchestrator(retriever=mock_retriever, llm_client=mock_llm_client) + + issues = [ + CodeIssue( + category=IssueCategory.STYLE, + level=IssueLevel.INFO, + message="Style issue", + file_path=Path("test.py"), + line_number=1, + ), + ] + + orchestrator.evaluate_issues_batch(issues, "x = 1") + + # Verify that the system prompt used is BATCH_TRIAGE_SYSTEM_PROMPT + call_kwargs = mock_llm_client.generate.call_args.kwargs + assert call_kwargs["system"] == BATCH_TRIAGE_SYSTEM_PROMPT + assert "code triage expert" in call_kwargs["system"] + # Ensure it does NOT contain the refactoring system prompt instructions + assert "proposed_code" not in call_kwargs["system"] + + +def test_evaluate_issues_batch_language_parameter(mock_llm_client, mock_retriever): + """Test that the language parameter is passed through to the prompt.""" + orchestrator = LLMOrchestrator(retriever=mock_retriever, llm_client=mock_llm_client) + + issues = [ + CodeIssue( + category=IssueCategory.STYLE, + level=IssueLevel.INFO, + message="Style issue", + file_path=Path("test.js"), + line_number=1, + ), + ] + + orchestrator.evaluate_issues_batch(issues, "const x = 1;", language="javascript") + + # Verify the prompt contains the language + call_kwargs = mock_llm_client.generate.call_args.kwargs + assert "```javascript" in call_kwargs["prompt"] + + +def test_evaluate_issues_batch_empty_issues(mock_llm_client, mock_retriever): + """Test batch evaluation handles empty issues correctly.""" + orchestrator = LLMOrchestrator(retriever=mock_retriever, llm_client=mock_llm_client) + + result = orchestrator.evaluate_issues_batch([], "source") + assert result == {} + mock_llm_client.generate.assert_not_called() + + +def test_evaluate_issues_batch_fallback_on_error(mock_llm_client, mock_retriever): + """Test batch evaluation handles LLM errors using a fallback mechanism.""" + mock_llm_client.generate.side_effect = Exception("LLM Error") + + orchestrator = LLMOrchestrator(retriever=mock_retriever, llm_client=mock_llm_client) + + issues = [ + CodeIssue( + category=IssueCategory.STYLE, + level=IssueLevel.INFO, + message="Line too long", + file_path=Path("test.py"), + line_number=20, + ) + ] + + result = orchestrator.evaluate_issues_batch(issues, "source") + + # It should fallback to 0.5 confidence + assert len(result) == 1 + assert all(v == 0.5 for v in result.values()) + + +def test_evaluate_issues_batch_fallback_on_bad_json(mock_llm_client, mock_retriever): + """Test batch evaluation handles invalid JSON appropriately.""" + mock_llm_client.generate.return_value = "not a json string at all" + orchestrator = LLMOrchestrator(retriever=mock_retriever, llm_client=mock_llm_client) + + issues = [ + CodeIssue( + category=IssueCategory.STYLE, + level=IssueLevel.INFO, + message="Line too long", + file_path=Path("test.py"), + line_number=20, + ) + ] + + result = orchestrator.evaluate_issues_batch(issues, "source") + + assert len(result) == 1 + assert all(v == 0.5 for v in result.values()) From f429e6a0415ee73419a61a751a83c041b6236422 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:55:48 +0000 Subject: [PATCH 3/3] fix: add warning log for issue ID collisions in evaluate_issues_batch Co-authored-by: omsherikar <180152315+omsherikar@users.noreply.github.com> --- refactron/llm/orchestrator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/refactron/llm/orchestrator.py b/refactron/llm/orchestrator.py index 2608384..42072b2 100644 --- a/refactron/llm/orchestrator.py +++ b/refactron/llm/orchestrator.py @@ -308,6 +308,9 @@ def evaluate_issues_batch( while unique_id in issues_data: suffix += 1 unique_id = f"{issue_id}_{suffix}" + logger.warning( + f"Issue ID collision detected for '{issue_id}', using '{unique_id}'" + ) issues_data[unique_id] = { "rule_id": getattr(issue, "rule_id", None),