diff --git a/pr_agent/algo/context_enrichment.py b/pr_agent/algo/context_enrichment.py index ccc3f0ce9d..a3b43f92e5 100644 --- a/pr_agent/algo/context_enrichment.py +++ b/pr_agent/algo/context_enrichment.py @@ -27,6 +27,31 @@ def append_small_file_context_to_diff( return f"{diff_text}\n\nAdditional context for small touched files:\n======\n{context}\n======" +def append_small_file_context_to_diffs( + diff_texts: list[str], + diff_files: list[FilePatchInfo], + token_handler, + *, + max_lines: int, + max_tokens: int, +) -> list[str]: + if not diff_texts: + return diff_texts + + return [ + append_small_file_context_to_diff( + diff_text, + diff_files, + token_handler, + max_lines=max_lines, + max_tokens=max_tokens, + ) + if diff_text + else diff_text + for diff_text in diff_texts + ] + + def build_small_file_context( diff_text: str, diff_files: list[FilePatchInfo], diff --git a/pr_agent/algo/suggestion_output_filter.py b/pr_agent/algo/suggestion_output_filter.py new file mode 100644 index 0000000000..5c64ddb256 --- /dev/null +++ b/pr_agent/algo/suggestion_output_filter.py @@ -0,0 +1,116 @@ +import copy +import logging + + +VALID_CONFIDENCE_VALUES = {"high", "medium", "low"} +VALID_EVIDENCE_TYPES = {"diff", "ticket", "inferred"} +FINDINGS_FILTER_MODE_OFF = "off" +FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED = "drop_low_confidence_inferred" +LOGGER = logging.getLogger(__name__) + + +def normalize_code_suggestions_output( + data: dict, + findings_metadata: bool = False, + filter_mode: str = FINDINGS_FILTER_MODE_OFF, +) -> dict: + if not isinstance(data, dict) or "code_suggestions" not in data or not isinstance(data["code_suggestions"], list): + return data + + if filter_mode != FINDINGS_FILTER_MODE_OFF and not findings_metadata: + _log_warning("findings_filter_mode has no effect when findings_metadata is false") + filter_mode = FINDINGS_FILTER_MODE_OFF + elif filter_mode not in {FINDINGS_FILTER_MODE_OFF, FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED}: + _log_warning(f"Unknown findings_filter_mode: {filter_mode!r}, defaulting to off") + filter_mode = FINDINGS_FILTER_MODE_OFF + + normalized_data = copy.deepcopy(data) + suggestions = normalized_data.get("code_suggestions") + if not isinstance(suggestions, list): + return normalized_data + + normalized_suggestions = [] + for suggestion in suggestions: + normalized_suggestion = normalize_suggestion(suggestion, findings_metadata=findings_metadata) + if normalized_suggestion: + normalized_suggestions.append(normalized_suggestion) + + filtered_suggestions, suppressed_count = filter_suggestions(normalized_suggestions, filter_mode=filter_mode) + if suppressed_count: + _log_debug("Suppressed low-confidence inferred code suggestions", artifact={"count": suppressed_count}) + normalized_data["code_suggestions"] = filtered_suggestions + + return normalized_data + + +def normalize_suggestion(suggestion: dict, findings_metadata: bool = False) -> dict | None: + if not isinstance(suggestion, dict): + return None + + normalized_suggestion = copy.deepcopy(suggestion) + + if findings_metadata: + confidence = normalize_enum_value(suggestion.get("confidence"), VALID_CONFIDENCE_VALUES) + if confidence: + normalized_suggestion["confidence"] = confidence + else: + normalized_suggestion.pop("confidence", None) + + evidence_type = normalize_enum_value(suggestion.get("evidence_type"), VALID_EVIDENCE_TYPES) + if evidence_type: + normalized_suggestion["evidence_type"] = evidence_type + else: + normalized_suggestion.pop("evidence_type", None) + else: + normalized_suggestion.pop("confidence", None) + normalized_suggestion.pop("evidence_type", None) + + return normalized_suggestion + + +def normalize_enum_value(value, valid_values: set[str]) -> str | None: + if value is None: + return None + + normalized_value = str(value).strip().lower() + if normalized_value in valid_values: + return normalized_value + + return None + + +def filter_suggestions(suggestions: list[dict], filter_mode: str = FINDINGS_FILTER_MODE_OFF) -> tuple[list[dict], int]: + if filter_mode != FINDINGS_FILTER_MODE_DROP_LOW_CONFIDENCE_INFERRED: + return suggestions, 0 + + filtered_suggestions = [] + suppressed_count = 0 + for suggestion in suggestions: + if suggestion.get("confidence") == "low" and suggestion.get("evidence_type") == "inferred": + suppressed_count += 1 + continue + filtered_suggestions.append(suggestion) + + return filtered_suggestions, suppressed_count + + +def _log_warning(message: str) -> None: + logger = _resolve_logger() + logger.warning(message) + + +def _log_debug(message: str, artifact: dict) -> None: + logger = _resolve_logger() + try: + logger.debug(message, artifact=artifact) + except TypeError: + logger.debug(message, extra={"artifact": artifact}) + + +def _resolve_logger(): + try: + from pr_agent.log import get_logger + + return get_logger() + except Exception: + return LOGGER diff --git a/pr_agent/algo/utils.py b/pr_agent/algo/utils.py index 3164631c35..541279ec3d 100644 --- a/pr_agent/algo/utils.py +++ b/pr_agent/algo/utils.py @@ -380,21 +380,43 @@ def format_review_issue_metadata(issue: dict, gfm_supported: bool, enable_badges return " ".join([f"`{label}`" for label in metadata_labels]) -def format_code_suggestion_metadata(label: str, score=None, enable_badges: bool | None = None) -> str: +def format_code_suggestion_metadata(label: str, score=None, confidence: str | None = None, + evidence_type: str | None = None, + enable_badges: bool | None = None) -> str: if enable_badges is None: - enable_badges = get_settings().pr_reviewer.get("findings_metadata_badges", False) + enable_badges = get_settings().pr_code_suggestions.get( + "findings_metadata_badges", + get_settings().pr_reviewer.get("findings_metadata_badges", False), + ) if not enable_badges: return "" + metadata_labels = [] label = str(label).strip() - if not label: - return "" + if label: + metadata_labels.append(label) score_value = "" if score is None else str(score).strip() if score_value: - return f" [{label}, importance: {score_value}]" + metadata_labels.append(f"importance: {score_value}") + + normalized_confidence = str(confidence or "").strip().lower() + if normalized_confidence in {"high", "medium", "low"}: + metadata_labels.append(f"{normalized_confidence.capitalize()} confidence") + + normalized_evidence_type = str(evidence_type or "").strip().lower() + evidence_labels = { + "diff": "Diff evidence", + "ticket": "Ticket evidence", + "inferred": "Inferred evidence", + } + if normalized_evidence_type in evidence_labels: + metadata_labels.append(evidence_labels[normalized_evidence_type]) + + if not metadata_labels: + return "" - return f" [{label}]" + return f" [{', '.join(metadata_labels)}]" def extract_relevant_lines_str(end_line, files, relevant_file, start_line, dedent=False) -> str: diff --git a/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts.toml b/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts.toml index 36b4d0dcf6..f82351d6d5 100644 --- a/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts.toml +++ b/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts.toml @@ -72,6 +72,16 @@ Specific guidelines for generating code suggestions: {%- endif %} - Be aware that your input consists only of partial code segments (PR diff code), not the complete codebase. Therefore, avoid making suggestions that might duplicate existing functionality, and refrain from questioning code elements (such as variable declarations or import statements) that may be defined elsewhere in the codebase. - When mentioning code elements (variables, names, or files) in your response, surround them with backticks (`). For example: "verify that `user_id` is..." +{%- if small_file_context %} +- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR. +{%- endif %} +{%- if findings_metadata %} +- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence. +- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues. +{%- endif %} +- For lower-severity concerns, be certain before suggesting. If you cannot confidently explain why something is a problem with a concrete scenario, do not suggest it. +- Do not speculate that a change might break other code unless you can identify the specific affected code path from the visible diff context. +- When confidence is limited but the potential impact is high, make the uncertainty explicit. Otherwise, prefer returning fewer suggestions over guessing. {%- if extra_instructions %} @@ -92,6 +102,10 @@ class CodeSuggestion(BaseModel): suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. Be short and concise") improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.") one_sentence_summary: str = Field(description="A concise, single-sentence overview (up to 6 words) of the suggested improvement. Focus on the 'what'. Be general, and avoid method or variable names.") +{%- if findings_metadata %} + confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low") + evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred") +{%- endif %} {%- if not focus_only_on_problems %} label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability', 'typo'. Other relevant labels are also acceptable.") {%- else %} @@ -119,6 +133,12 @@ code_suggestions: ... one_sentence_summary: | ... +{%- if findings_metadata %} + confidence: | + high + evidence_type: | + diff +{%- endif %} label: | ... ``` @@ -158,6 +178,12 @@ code_suggestions: ... one_sentence_summary: | ... +{%- if findings_metadata %} + confidence: | + high + evidence_type: | + diff +{%- endif %} label: | ... ``` diff --git a/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml b/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml index 6178ee23c0..987c3780b4 100644 --- a/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml +++ b/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml @@ -61,6 +61,16 @@ Specific guidelines for generating code suggestions: - When mentioning code elements (variables, names, or files) in your response, surround them with markdown backticks (`). For example: "verify that `user_id` is..." - Note that you will only see partial code segments that were changed (diff hunks in a PR code), and not the entire codebase. Avoid suggestions that might duplicate existing functionality of the outer codebase. In addition, the absence of a definition, declaration, import, or initialization for any entity in the PR code is NEVER a basis for a suggestion. - Also note that if the code ends at an opening brace or statement that begins a new scope (like 'if', 'for', 'try'), don't treat it as incomplete. Instead, acknowledge the visible scope boundary and analyze only the code shown. +{%- if small_file_context %} +- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR. +{%- endif %} +{%- if findings_metadata %} +- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence. +- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues. +{%- endif %} +- For lower-severity concerns, be certain before suggesting. If you cannot confidently explain why something is a problem with a concrete scenario, do not suggest it. +- Do not speculate that a change might break other code unless you can identify the specific affected code path from the visible diff context. +- When confidence is limited but the potential impact is high, make the uncertainty explicit. Otherwise, prefer returning fewer suggestions over guessing. {%- if extra_instructions %} @@ -81,6 +91,10 @@ class CodeSuggestion(BaseModel): suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Use 2-3 short sentences.") improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.") one_sentence_summary: str = Field(description="A single-sentence overview (up to 6 words) of the suggestion. Focus on the 'what'. Be general, and avoid mentioning method or variable names.") +{%- if findings_metadata %} + confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low") + evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred") +{%- endif %} {%- if not focus_only_on_problems %} label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability', 'typo'. Other relevant labels are also acceptable.") {%- else %} @@ -108,6 +122,12 @@ code_suggestions: ... one_sentence_summary: | ... +{%- if findings_metadata %} + confidence: | + high + evidence_type: | + diff +{%- endif %} label: | ... ``` @@ -147,6 +167,12 @@ code_suggestions: ... one_sentence_summary: | ... +{%- if findings_metadata %} + confidence: | + high + evidence_type: | + diff +{%- endif %} label: | ... ``` diff --git a/pr_agent/settings/code_suggestions/pr_code_suggestions_reflect_prompts.toml b/pr_agent/settings/code_suggestions/pr_code_suggestions_reflect_prompts.toml index f87e7be467..83bb840311 100644 --- a/pr_agent/settings/code_suggestions/pr_code_suggestions_reflect_prompts.toml +++ b/pr_agent/settings/code_suggestions/pr_code_suggestions_reflect_prompts.toml @@ -22,6 +22,7 @@ Key guidelines for evaluation: - Extend your review beyond the specifically mentioned code lines to encompass surrounding PR code context, verifying the suggestions' contextual accuracy. - Validate the 'existing_code' field by confirming it matches or is accurately derived from code lines within a '__new hunk__' section of the PR code diff. - Ensure the 'improved_code' section accurately reflects the 'existing_code' segment after the suggested modification is applied. +- If a suggestion depends on indirect clues rather than directly visible evidence, lower confidence accordingly. Suggestions with weak inferred evidence should receive a low score. - Apply a nuanced scoring system: - Reserve high scores (9-10) for critical issues such as major bugs or security concerns (merge-blocking, won't compile, deterministic test failures, common-path crash, data loss/corruption, or clear contract breakage). @@ -82,6 +83,13 @@ __old hunk__ {%- if is_ai_metadata %} - When available, an AI-generated summary will precede each file's diff, with a high-level overview of the changes. Note that this summary may not be fully accurate or comprehensive. {%- endif %} +{%- if small_file_context %} +- Some files may include a `## Full file context:` section with the complete post-change contents of a small touched file. Use this only to validate suggestions related to the changed file, and do not report issues from untouched regions unless they are necessary to explain a defect introduced by the PR. +{%- endif %} +{%- if findings_metadata %} +- For each suggestion, set `confidence` from `high`, `medium`, or `low` based only on the visible evidence. +- Set `evidence_type` to `diff` when changed lines directly support the suggestion, `ticket` for a clear ticket mismatch, or `inferred` when the suggestion depends on indirect clues. +{%- endif %} The output must be a YAML object equivalent to type $PRCodeSuggestionsFeedback, according to the following Pydantic definitions: @@ -92,6 +100,10 @@ class CodeSuggestionFeedback(BaseModel): relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the added '__new hunk__' line numbers, and correspond to the first line of the relevant 'existing code' snippet.") relevant_lines_end: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion ends (inclusive). Should be derived from the added '__new hunk__' line numbers, and correspond to the end of the relevant 'existing code' snippet") suggestion_score: int = Field(description="Evaluate the suggestion and assign a score from 0 to 10. Give 0 if the suggestion is wrong. For valid suggestions, score from 1 (lowest impact/importance) to 10 (highest impact/importance).") +{%- if findings_metadata %} + confidence: str = Field(description="Your confidence in this suggestion based only on the visible evidence. Allowed values: high, medium, low") + evidence_type: str = Field(description="What directly supports this suggestion. Allowed values: diff, ticket, inferred") +{%- endif %} why: str = Field(description="Briefly explain the score given in 1-2 short sentences, focusing on the suggestion's impact, relevance, and accuracy. When mentioning code elements (variables, names, or files) in your response, surround them with markdown backticks (`).") class PRCodeSuggestionsFeedback(BaseModel): @@ -108,6 +120,12 @@ code_suggestions: relevant_lines_start: 13 relevant_lines_end: 14 suggestion_score: 6 +{%- if findings_metadata %} + confidence: | + medium + evidence_type: | + diff +{%- endif %} why: | The variable name 't' is not descriptive enough - ... @@ -141,6 +159,12 @@ code_suggestions: relevant_lines_start: ... relevant_lines_end: ... suggestion_score: ... +{%- if findings_metadata %} + confidence: | + ... + evidence_type: | + ... +{%- endif %} why: | ... - ... diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index 15acd7b803..d4e52af2fa 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -138,6 +138,12 @@ use_conversation_history=true commitable_code_suggestions = false dual_publishing_score_threshold=-1 # -1 to disable, [0-10] to set the threshold (>=) for publishing a code suggestion both in a table and as commitable focus_only_on_problems=true +findings_metadata = false +findings_metadata_badges = false +findings_filter_mode = "off" +small_file_context = false +small_file_context_max_lines = 150 +small_file_context_max_tokens = 3500 # extra_instructions = "" enable_help_text=false diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py index b6e7d75edd..09eb6ab420 100644 --- a/pr_agent/tools/pr_code_suggestions.py +++ b/pr_agent/tools/pr_code_suggestions.py @@ -13,10 +13,12 @@ from pr_agent.algo import MAX_TOKENS from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler +from pr_agent.algo.context_enrichment import append_small_file_context_to_diffs from pr_agent.algo.git_patch_processing import decouple_and_convert_to_hunks_with_lines_numbers from pr_agent.algo.pr_processing import (add_ai_metadata_to_diff_files, get_pr_diff, get_pr_multi_diffs, retry_with_fallback_models) +from pr_agent.algo.suggestion_output_filter import normalize_code_suggestions_output from pr_agent.algo.token_handler import TokenHandler from pr_agent.algo.utils import (ModelType, clip_tokens, format_code_suggestion_metadata, get_max_tokens, @@ -60,6 +62,9 @@ def __init__(self, pr_url: str, cli_mode=False, args: list = None, get_settings().set("config.enable_ai_metadata", False) get_logger().debug(f"AI metadata is disabled for this command") + self.findings_metadata = get_settings().pr_code_suggestions.get("findings_metadata", False) + self.small_file_context = get_settings().pr_code_suggestions.get("small_file_context", False) + self.vars = { "title": self.git_provider.pr.title, "branch": self.git_provider.get_pr_branch(), @@ -73,6 +78,8 @@ def __init__(self, pr_url: str, cli_mode=False, args: list = None, "relevant_best_practices": "", "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False), "focus_only_on_problems": get_settings().get("pr_code_suggestions.focus_only_on_problems", False), + "findings_metadata": self.findings_metadata, + "small_file_context": self.small_file_context, "date": datetime.now().strftime('%Y-%m-%d'), 'duplicate_prompt_examples': get_settings().config.get('duplicate_prompt_examples', False), } @@ -375,6 +382,10 @@ async def _prepare_prediction(self, model: str) -> dict: disable_extra_lines=False) self.patches_diff_list = [self.patches_diff] self.patches_diff_no_line_number = self.remove_line_numbers([self.patches_diff])[0] + self.patches_diff_list_no_line_numbers = [self.patches_diff_no_line_number] + self._append_small_file_context_to_diff_lists() + self.patches_diff = self.patches_diff_list[0] + self.patches_diff_no_line_number = self.patches_diff_list_no_line_numbers[0] if self.patches_diff: get_logger().debug(f"PR diff", artifact=self.patches_diff) @@ -392,7 +403,7 @@ async def _get_prediction(self, model: str, patches_diff: str, patches_diff_no_l variables["diff_no_line_numbers"] = patches_diff_no_line_number # update diff environment = Environment(undefined=StrictUndefined) system_prompt = environment.from_string(self.pr_code_suggestions_prompt_system).render(variables) - user_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.user).render(variables) + user_prompt = environment.from_string(self.pr_code_suggestions_prompt_user).render(variables) response, finish_reason = await self.ai_handler.chat_completion( model=model, temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt) if not get_settings().config.publish_output: @@ -420,6 +431,7 @@ async def _get_prediction(self, model: str, patches_diff: str, patches_diff_no_l suggestion["score"] = 0 suggestion["score_why"] = "" + data = self._normalize_code_suggestions_output(data) return data async def analyze_self_reflection_response(self, data, response_reflect): @@ -430,6 +442,10 @@ async def analyze_self_reflection_response(self, data, response_reflect): try: suggestion["score"] = code_suggestions_feedback[i]["suggestion_score"] suggestion["score_why"] = code_suggestions_feedback[i]["why"] + if 'confidence' in code_suggestions_feedback[i]: + suggestion['confidence'] = code_suggestions_feedback[i]['confidence'] + if 'evidence_type' in code_suggestions_feedback[i]: + suggestion['evidence_type'] = code_suggestions_feedback[i]['evidence_type'] if 'relevant_lines_start' not in suggestion: relevant_lines_start = code_suggestions_feedback[i].get('relevant_lines_start', -1) @@ -490,7 +506,8 @@ def _truncate_if_needed(suggestion): def _prepare_pr_code_suggestions(self, predictions: str) -> Dict: data = load_yaml(predictions.strip(), - keys_fix_yaml=["relevant_file", "suggestion_content", "existing_code", "improved_code"], + keys_fix_yaml=["relevant_file", "suggestion_content", "existing_code", "improved_code", + "confidence", "evidence_type"], first_key="code_suggestions", last_key="label") if isinstance(data, list): data = {'code_suggestions': data} @@ -550,7 +567,7 @@ async def push_inline_code_suggestions(self, data): else: return self.git_provider.publish_comment('No suggestions found to improve this PR.') - findings_metadata_badges = get_settings().pr_reviewer.get("findings_metadata_badges", False) + findings_metadata_badges = get_settings().pr_code_suggestions.get("findings_metadata_badges", False) for d in data['code_suggestions']: try: @@ -569,6 +586,8 @@ async def push_inline_code_suggestions(self, data): suggestion_metadata = format_code_suggestion_metadata( label, d.get('score'), + d.get('confidence'), + d.get('evidence_type'), enable_badges=findings_metadata_badges, ) body = f"**Suggestion:** {content}{suggestion_metadata}\n```suggestion\n" + new_code_snippet + "\n```" @@ -700,6 +719,9 @@ async def prepare_prediction_main(self, model: str) -> dict: model, max_calls=get_settings().pr_code_suggestions.max_number_of_calls, add_line_numbers=True) # decouple hunk with line numbers + self.patches_diff_list_no_line_numbers = self.remove_line_numbers(self.patches_diff_list) + + self._append_small_file_context_to_diff_lists() if self.patches_diff_list: get_logger().info(f"Number of PR chunk calls: {len(self.patches_diff_list)}") @@ -740,6 +762,39 @@ async def prepare_prediction_main(self, model: str) -> dict: self.data = data = None return data + def _append_small_file_context_to_diff_lists(self): + patches_diff_list_no_line_numbers = getattr(self, "patches_diff_list_no_line_numbers", None) + if not self.small_file_context or not self.patches_diff_list or not patches_diff_list_no_line_numbers: + return + + diff_files = self.git_provider.diff_files if self.git_provider.diff_files else self.git_provider.get_diff_files() + max_lines = get_settings().pr_code_suggestions.get("small_file_context_max_lines", 150) + max_tokens = get_settings().pr_code_suggestions.get("small_file_context_max_tokens", 3500) + self.patches_diff_list = append_small_file_context_to_diffs( + self.patches_diff_list, + diff_files, + self.token_handler, + max_lines=max_lines, + max_tokens=max_tokens, + ) + self.patches_diff_list_no_line_numbers = append_small_file_context_to_diffs( + patches_diff_list_no_line_numbers, + diff_files, + self.token_handler, + max_lines=max_lines, + max_tokens=max_tokens, + ) + + def _normalize_code_suggestions_output(self, data: Dict) -> Dict: + if not data: + return data + + return normalize_code_suggestions_output( + data, + findings_metadata=self.findings_metadata, + filter_mode=get_settings().pr_code_suggestions.get("findings_filter_mode", "off"), + ) + async def convert_to_decoupled_with_line_numbers(self, patches_diff_list_no_line_numbers, model) -> List[str]: with get_logger().contextualize(sub_feature='convert_to_decoupled_with_line_numbers'): try: @@ -926,6 +981,8 @@ async def self_reflect_on_suggestions(self, 'num_code_suggestions': len(suggestion_list), 'prev_suggestions_str': prev_suggestions_str, "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False), + 'findings_metadata': self.findings_metadata, + 'small_file_context': self.small_file_context, 'duplicate_prompt_examples': get_settings().config.get('duplicate_prompt_examples', False)} environment = Environment(undefined=StrictUndefined) diff --git a/tests/unittest/test_context_enrichment.py b/tests/unittest/test_context_enrichment.py index 0cfeedcf29..fc22b61f75 100644 --- a/tests/unittest/test_context_enrichment.py +++ b/tests/unittest/test_context_enrichment.py @@ -1,4 +1,9 @@ -from pr_agent.algo.context_enrichment import append_small_file_context_to_diff, extract_files_in_diff, render_full_file_context +from pr_agent.algo.context_enrichment import ( + append_small_file_context_to_diff, + append_small_file_context_to_diffs, + extract_files_in_diff, + render_full_file_context, +) from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo @@ -135,6 +140,41 @@ def test_append_small_file_context_to_diff_prefers_smaller_files_when_budget_is_ assert "## Full file context: 'src/small_a.py'" in enriched_diff assert "## Full file context: 'src/small_b.py'" in enriched_diff + def test_append_small_file_context_to_diffs_enriches_each_chunk_independently(self): + diff_texts = [ + "## File: 'src/one.py'\n\n@@ ... @@\n__new hunk__\n1 +updated = True\n", + "## File: 'src/two.py'\n\n@@ ... @@\n__new hunk__\n1 +enabled = True\n", + ] + diff_files = [ + FilePatchInfo( + base_file="updated = False\n", + head_file="updated = True\n", + patch="@@ -1 +1 @@\n-updated = False\n+updated = True\n", + filename="src/one.py", + edit_type=EDIT_TYPE.MODIFIED, + ), + FilePatchInfo( + base_file="enabled = False\n", + head_file="enabled = True\n", + patch="@@ -1 +1 @@\n-enabled = False\n+enabled = True\n", + filename="src/two.py", + edit_type=EDIT_TYPE.MODIFIED, + ), + ] + + enriched_diffs = append_small_file_context_to_diffs( + diff_texts, + diff_files, + MockTokenHandler(), + max_lines=10, + max_tokens=500, + ) + + assert "## Full file context: 'src/one.py'" in enriched_diffs[0] + assert "## Full file context: 'src/two.py'" not in enriched_diffs[0] + assert "## Full file context: 'src/two.py'" in enriched_diffs[1] + assert "## Full file context: 'src/one.py'" not in enriched_diffs[1] + def test_render_full_file_context_keeps_blank_line_numbers(self): file = FilePatchInfo( base_file="", diff --git a/tests/unittest/test_pr_code_suggestions_guardrails.py b/tests/unittest/test_pr_code_suggestions_guardrails.py new file mode 100644 index 0000000000..26cf96316b --- /dev/null +++ b/tests/unittest/test_pr_code_suggestions_guardrails.py @@ -0,0 +1,152 @@ +from types import SimpleNamespace + +import pytest + +from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions + + +class FakeSection(SimpleNamespace): + def get(self, key, default=None): + return getattr(self, key, default) + + +@pytest.mark.asyncio +async def test_analyze_self_reflection_response_carries_metadata(monkeypatch): + fake_settings = SimpleNamespace( + config=SimpleNamespace(publish_output=False), + pr_code_suggestions=FakeSection(commitable_code_suggestions=False, findings_filter_mode="off"), + ) + monkeypatch.setattr("pr_agent.tools.pr_code_suggestions.get_settings", lambda: fake_settings) + + tool = PRCodeSuggestions.__new__(PRCodeSuggestions) + tool.findings_metadata = True + tool.validate_one_liner_suggestion_not_repeating_code = lambda suggestion: suggestion + + data = { + "code_suggestions": [ + { + "relevant_file": "src/example.py", + "label": "possible issue", + "existing_code": "value = foo()", + "improved_code": "value = bar()", + "one_sentence_summary": "Tighten validation", + "suggestion_content": "Use the validated value.", + } + ] + } + response_reflect = """ +code_suggestions: +- suggestion_summary: | + Tighten validation + relevant_file: | + src/example.py + relevant_lines_start: 12 + relevant_lines_end: 13 + suggestion_score: 8 + confidence: | + HIGH + evidence_type: | + DIFF + why: | + The suggestion is directly supported by the changed lines. +""" + + await tool.analyze_self_reflection_response(data, response_reflect) + + suggestion = data["code_suggestions"][0] + assert suggestion["score"] == 8 + assert suggestion["confidence"].strip().lower() == "high" + assert suggestion["evidence_type"].strip().lower() == "diff" + assert suggestion["relevant_lines_start"] == 12 + assert suggestion["relevant_lines_end"] == 13 + + +def test_normalize_code_suggestions_output_uses_filter_mode(monkeypatch): + fake_settings = SimpleNamespace( + config=SimpleNamespace(publish_output=False), + pr_code_suggestions=FakeSection(findings_filter_mode="drop_low_confidence_inferred"), + ) + monkeypatch.setattr("pr_agent.tools.pr_code_suggestions.get_settings", lambda: fake_settings) + + tool = PRCodeSuggestions.__new__(PRCodeSuggestions) + tool.findings_metadata = True + + data = { + "code_suggestions": [ + { + "relevant_file": "src/keep.py", + "label": "possible issue", + "suggestion_content": "Keep me.", + "confidence": "medium", + "evidence_type": "diff", + }, + { + "relevant_file": "src/drop.py", + "label": "possible issue", + "suggestion_content": "Drop me.", + "confidence": "low", + "evidence_type": "inferred", + }, + ] + } + + normalized = tool._normalize_code_suggestions_output(data) + + assert [suggestion["relevant_file"] for suggestion in normalized["code_suggestions"]] == ["src/keep.py"] + + +def test_append_small_file_context_to_diff_lists_skips_when_legacy_no_line_numbers_missing(): + tool = PRCodeSuggestions.__new__(PRCodeSuggestions) + tool.small_file_context = True + tool.patches_diff_list = ["## File: 'src/example.py'"] + + tool._append_small_file_context_to_diff_lists() + + assert tool.patches_diff_list == ["## File: 'src/example.py'"] + + +@pytest.mark.asyncio +async def test_prepare_prediction_main_refreshes_no_line_chunks_after_fallback(monkeypatch): + fake_settings = SimpleNamespace( + pr_code_suggestions=FakeSection( + decouple_hunks=False, + max_number_of_calls=3, + parallel_calls=False, + suggestions_score_threshold=0, + findings_filter_mode="off", + ) + ) + monkeypatch.setattr("pr_agent.tools.pr_code_suggestions.get_settings", lambda: fake_settings) + + tool = PRCodeSuggestions.__new__(PRCodeSuggestions) + tool.git_provider = object() + tool.token_handler = object() + tool.small_file_context = False + tool.findings_metadata = False + tool.remove_line_numbers = lambda patches: [f"normalized:{patch}" for patch in patches] + + calls = [] + + async def fake_get_prediction(model, patches_diff, patches_diff_no_line_numbers): + calls.append((patches_diff, patches_diff_no_line_numbers)) + return {"code_suggestions": [{"score": 1}]} + + async def fake_convert(_patches_diff_list_no_line_numbers, _model): + return [] + + tool._get_prediction = fake_get_prediction + tool.convert_to_decoupled_with_line_numbers = fake_convert + + responses = { + False: ["no-line-chunk"], + True: ["line-chunk"], + } + + monkeypatch.setattr( + "pr_agent.tools.pr_code_suggestions.get_pr_multi_diffs", + lambda _git_provider, _token_handler, _model, max_calls, add_line_numbers: responses[add_line_numbers], + ) + + await tool.prepare_prediction_main("fake-model") + + assert calls == [("line-chunk", "normalized:line-chunk")] diff --git a/tests/unittest/test_suggestion_output_filter.py b/tests/unittest/test_suggestion_output_filter.py new file mode 100644 index 0000000000..71b589f73f --- /dev/null +++ b/tests/unittest/test_suggestion_output_filter.py @@ -0,0 +1,113 @@ +import pr_agent.algo.suggestion_output_filter as suggestion_output_filter + + +class TestSuggestionOutputFilter: + def test_normalize_code_suggestions_output_keeps_valid_metadata(self): + data = { + "code_suggestions": [ + { + "relevant_file": "src/one.py", + "label": "possible issue", + "suggestion_content": "Issue one", + "confidence": "HIGH", + "evidence_type": "DIFF", + "unknown": "preserved value", + }, + { + "relevant_file": "src/two.py", + "label": "possible issue", + "suggestion_content": "Issue two", + "confidence": "unsupported", + "evidence_type": "ticket", + }, + ] + } + + normalized = suggestion_output_filter.normalize_code_suggestions_output( + data, + findings_metadata=True, + filter_mode="off", + ) + + suggestions = normalized["code_suggestions"] + assert suggestions[0]["confidence"] == "high" + assert suggestions[0]["evidence_type"] == "diff" + assert suggestions[0]["unknown"] == "preserved value" + assert "confidence" not in suggestions[1] + assert suggestions[1]["evidence_type"] == "ticket" + + def test_normalize_code_suggestions_output_can_drop_low_confidence_inferred(self): + data = { + "code_suggestions": [ + { + "relevant_file": "src/one.py", + "label": "possible issue", + "suggestion_content": "Keep me", + "confidence": "medium", + "evidence_type": "diff", + }, + { + "relevant_file": "src/two.py", + "label": "possible issue", + "suggestion_content": "Drop me", + "confidence": "low", + "evidence_type": "inferred", + }, + ] + } + + normalized = suggestion_output_filter.normalize_code_suggestions_output( + data, + findings_metadata=True, + filter_mode="drop_low_confidence_inferred", + ) + + suggestions = normalized["code_suggestions"] + assert len(suggestions) == 1 + assert suggestions[0]["relevant_file"] == "src/one.py" + + def test_normalize_code_suggestions_output_strips_metadata_when_disabled(self): + data = { + "code_suggestions": [ + { + "relevant_file": "src/one.py", + "label": "possible issue", + "suggestion_content": "Issue one", + "confidence": "high", + "evidence_type": "diff", + } + ] + } + + normalized = suggestion_output_filter.normalize_code_suggestions_output( + data, + findings_metadata=False, + ) + + suggestion = normalized["code_suggestions"][0] + assert "confidence" not in suggestion + assert "evidence_type" not in suggestion + + def test_normalize_code_suggestions_output_warns_on_unknown_filter_mode(self, monkeypatch): + warnings = [] + data = { + "code_suggestions": [ + { + "relevant_file": "src/one.py", + "label": "possible issue", + "suggestion_content": "Keep me", + "confidence": "low", + "evidence_type": "inferred", + } + ] + } + monkeypatch.setattr(suggestion_output_filter, "_log_warning", warnings.append) + + normalized = suggestion_output_filter.normalize_code_suggestions_output( + data, + findings_metadata=True, + filter_mode="unknown_mode", + ) + + assert warnings == ["Unknown findings_filter_mode: 'unknown_mode', defaulting to off"] + assert len(normalized["code_suggestions"]) == 1